genevalidator 1.6.12 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
module GeneValidator
|
|
2
2
|
# Exception raised when BLAST path is not added to the CLASSPATH
|
|
3
|
-
class ClasspathError <
|
|
3
|
+
class ClasspathError < RuntimeError
|
|
4
4
|
end
|
|
5
5
|
|
|
6
6
|
# Exception raised when the command line type argument
|
|
7
7
|
# does not correspond to the type of the sequences in the fasta file
|
|
8
|
-
class SequenceTypeError <
|
|
8
|
+
class SequenceTypeError < RuntimeError
|
|
9
9
|
def to_s
|
|
10
10
|
"\nSequence Type error: Possible cause include that the blast output" \
|
|
11
11
|
" was not obtained against a protein database.\n"
|
|
@@ -13,15 +13,15 @@ module GeneValidator
|
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
# Exception raised when a nonexistent file is accessed
|
|
16
|
-
class FileNotFoundException <
|
|
16
|
+
class FileNotFoundException < RuntimeError
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
# Exception raised when blast does not find any hit
|
|
20
|
-
class QueryError <
|
|
20
|
+
class QueryError < RuntimeError
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
# Exception raised when a validation class is not instance of ValidationTest
|
|
24
|
-
class ValidationClassError <
|
|
24
|
+
class ValidationClassError < RuntimeError
|
|
25
25
|
def to_s
|
|
26
26
|
"\nClass Type error: Possible cause include that one of the validations" \
|
|
27
27
|
" is not a sub-class of ValidationTest\n"
|
|
@@ -30,7 +30,7 @@ module GeneValidator
|
|
|
30
30
|
|
|
31
31
|
# Exception raised when a validation report class is not instance of
|
|
32
32
|
# ValidationReport
|
|
33
|
-
class ReportClassError <
|
|
33
|
+
class ReportClassError < RuntimeError
|
|
34
34
|
def to_s
|
|
35
35
|
"\nClass Type error: Possible causes include that the type of one of" \
|
|
36
36
|
' the validation reports is not a subclass of the ValidationReport' \
|
|
@@ -40,20 +40,20 @@ module GeneValidator
|
|
|
40
40
|
|
|
41
41
|
# Exception raised when there are not enough blast hits to make a statistical
|
|
42
42
|
# validation
|
|
43
|
-
class NotEnoughHitsError <
|
|
43
|
+
class NotEnoughHitsError < RuntimeError
|
|
44
44
|
end
|
|
45
45
|
|
|
46
46
|
# Exception raised when a function depending on the internet connection raises
|
|
47
47
|
# Exception
|
|
48
|
-
class NoInternetError <
|
|
48
|
+
class NoInternetError < RuntimeError
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
# Exception raised when the alignment initialization raises exception
|
|
52
|
-
class NoMafftInstallationError <
|
|
52
|
+
class NoMafftInstallationError < RuntimeError
|
|
53
53
|
end
|
|
54
54
|
|
|
55
55
|
# Exception raised when the -v argument didn't filter any validation test
|
|
56
|
-
class NoValidationError <
|
|
56
|
+
class NoValidationError < RuntimeError
|
|
57
57
|
def to_s
|
|
58
58
|
"\nValidation error: Possible cause inlcude that the -v arguments" \
|
|
59
59
|
" supplied is not valid\n"
|
|
@@ -61,7 +61,7 @@ module GeneValidator
|
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
# Exception raised when there are alias duplications
|
|
64
|
-
class AliasDuplicationError <
|
|
64
|
+
class AliasDuplicationError < RuntimeError
|
|
65
65
|
def to_s
|
|
66
66
|
"\nAlias Duplication error: Possible cause: At least two validations" \
|
|
67
67
|
" have the same CLI alias\n"
|
|
@@ -69,27 +69,27 @@ module GeneValidator
|
|
|
69
69
|
end
|
|
70
70
|
|
|
71
71
|
# Exception raised when the BLAST database is not set up with the '-parse_seqids' arg.
|
|
72
|
-
class BLASTDBError <
|
|
72
|
+
class BLASTDBError < RuntimeError
|
|
73
73
|
end
|
|
74
74
|
|
|
75
75
|
# Error raised by QI Validation when the query does not have QI tag
|
|
76
|
-
class NotEnoughEvidence <
|
|
76
|
+
class NotEnoughEvidence < RuntimeError
|
|
77
77
|
end
|
|
78
78
|
|
|
79
79
|
# Exception raised when there are alias duplications
|
|
80
|
-
class NoPIdentError <
|
|
80
|
+
class NoPIdentError < RuntimeError
|
|
81
81
|
end
|
|
82
82
|
|
|
83
83
|
# Exception raised when the tabular format does not correspond to the tabular
|
|
84
84
|
# argument
|
|
85
|
-
class InconsistentTabularFormat <
|
|
85
|
+
class InconsistentTabularFormat < RuntimeError
|
|
86
86
|
end
|
|
87
87
|
|
|
88
88
|
# Exception raised when there are more than one reading frame among the hits
|
|
89
89
|
# of one prediction
|
|
90
|
-
class ReadingFrameError <
|
|
90
|
+
class ReadingFrameError < RuntimeError
|
|
91
91
|
end
|
|
92
92
|
|
|
93
|
-
class OtherError <
|
|
93
|
+
class OtherError < RuntimeError
|
|
94
94
|
end
|
|
95
95
|
end
|
|
@@ -9,15 +9,15 @@ module GeneValidator
|
|
|
9
9
|
sum / length.to_f
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
-
def median
|
|
13
|
-
sorted = sort
|
|
12
|
+
def median(already_sorted = false)
|
|
13
|
+
sorted = already_sorted ? self : sort
|
|
14
14
|
len = sorted.length
|
|
15
15
|
(sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def mode
|
|
19
|
-
freq =
|
|
20
|
-
|
|
19
|
+
freq = each_with_object(Hash.new(0)) { |v, h| h[v] += 1; }
|
|
20
|
+
max_by { |v| freq[v] }
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
def sample_variance
|
|
@@ -29,6 +29,23 @@ module GeneValidator
|
|
|
29
29
|
def standard_deviation
|
|
30
30
|
Math.sqrt(sample_variance)
|
|
31
31
|
end
|
|
32
|
+
|
|
33
|
+
def all_quartiles
|
|
34
|
+
sorted = sort
|
|
35
|
+
len = sorted.length
|
|
36
|
+
split = sorted.median_split
|
|
37
|
+
[
|
|
38
|
+
split[0].median(true),
|
|
39
|
+
sorted.median(true),
|
|
40
|
+
split[1].median(true)
|
|
41
|
+
]
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def median_split
|
|
45
|
+
len = length
|
|
46
|
+
center = len % 2
|
|
47
|
+
[self[0..len / 2 - 1], self[len / 2 + center..-1]]
|
|
48
|
+
end
|
|
32
49
|
end
|
|
33
50
|
end
|
|
34
51
|
|
|
@@ -13,24 +13,25 @@ module GeneValidator
|
|
|
13
13
|
class RawSequences
|
|
14
14
|
class <<self
|
|
15
15
|
extend Forwardable
|
|
16
|
-
def_delegators GeneValidator, :opt, :config
|
|
16
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
|
17
17
|
|
|
18
18
|
def init
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
warn '==> Extracting fasta sequences for each BLAST HSP from the' \
|
|
20
|
+
' BLAST database'
|
|
21
21
|
|
|
22
22
|
@blast_file = opt[:blast_xml_file] if opt[:blast_xml_file]
|
|
23
23
|
@blast_file = opt[:blast_tabular_file] if opt[:blast_tabular_file]
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
fname = File.basename(@blast_file)
|
|
26
|
+
opt[:raw_sequences] = File.join(dirs[:tmp_dir], "#{fname}.raw_seq")
|
|
27
|
+
@index_file = File.join(dirs[:tmp_dir], "#{fname}.index")
|
|
27
28
|
end
|
|
28
29
|
|
|
29
30
|
##
|
|
30
31
|
# Obtains raw_sequences from BLAST output file...
|
|
31
32
|
def run
|
|
32
33
|
init
|
|
33
|
-
if opt[:db]
|
|
34
|
+
if opt[:db].match?(/remote/)
|
|
34
35
|
write_a_raw_seq_file(opt[:raw_sequences], 'remote')
|
|
35
36
|
else
|
|
36
37
|
write_an_index_file(@index_file, 'local')
|
|
@@ -55,12 +56,13 @@ module GeneValidator
|
|
|
55
56
|
index_hash = {}
|
|
56
57
|
keys.each_with_index do |k, i|
|
|
57
58
|
start = values[i]
|
|
58
|
-
endf =
|
|
59
|
+
endf = i == values.length - 1 ? content.length - 1 : values[i + 1]
|
|
59
60
|
index_hash[k] = [start, endf]
|
|
60
61
|
end
|
|
61
62
|
|
|
62
63
|
# create FASTA index
|
|
63
|
-
|
|
64
|
+
fname = File.basename(raw_seq_file)
|
|
65
|
+
config[:raw_seq_file_index] = File.join(dirs[:tmp_dir], "#{fname}.idx")
|
|
64
66
|
config[:raw_seq_file_load] = index_hash
|
|
65
67
|
|
|
66
68
|
File.open(config[:raw_seq_file_index], 'w') do |f|
|
|
@@ -76,29 +78,29 @@ module GeneValidator
|
|
|
76
78
|
iterate_xml(file, db_type) if opt[:blast_xml_file]
|
|
77
79
|
iterate_tabular(file, db_type) if opt[:blast_tabular_file]
|
|
78
80
|
rescue BLASTDBError
|
|
79
|
-
|
|
81
|
+
warn '*** BLAST Database Error: Genevalidator requires BLAST' \
|
|
80
82
|
" databases to be created with the '-parse_seqids argument."
|
|
81
|
-
|
|
82
|
-
|
|
83
|
+
warn ' See https://github.com/wurmlab/genevalidator' \
|
|
84
|
+
'#setting-up-a-blast-database for more information'
|
|
83
85
|
exit 1
|
|
84
|
-
rescue
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
86
|
+
rescue StandardError
|
|
87
|
+
warn '*** Error: There was an error in analysing the BLAST'
|
|
88
|
+
warn ' output file. Please ensure that BLAST output file'
|
|
89
|
+
warn ' is in the correct format and then try again. If you'
|
|
90
|
+
warn ' are using a remote database, please ensure that you'
|
|
91
|
+
warn ' have internet access.'
|
|
90
92
|
exit 1
|
|
91
93
|
ensure
|
|
92
94
|
file.close unless file.nil?
|
|
93
95
|
end
|
|
94
96
|
|
|
95
|
-
|
|
97
|
+
alias write_a_raw_seq_file write_an_index_file
|
|
96
98
|
|
|
97
99
|
def iterate_xml(file, db_type)
|
|
98
100
|
n = Bio::BlastXMLParser::XmlIterator.new(opt[:blast_xml_file]).to_enum
|
|
99
101
|
n.each do |iter|
|
|
100
102
|
iter.each do |hit|
|
|
101
|
-
|
|
103
|
+
raise BLASTDBError if hit.hit_id =~ /\|BL_ORD_ID\|/
|
|
102
104
|
if db_type == 'remote' || hit.hit_id.nil?
|
|
103
105
|
file.puts FetchRawSequences.extract_from_remote_db(hit.accession)
|
|
104
106
|
else
|
|
@@ -116,7 +118,7 @@ module GeneValidator
|
|
|
116
118
|
headers: table_headers)
|
|
117
119
|
|
|
118
120
|
rows.each do |row|
|
|
119
|
-
|
|
121
|
+
raise BLASTDBError if row['sseqid'] =~ /\|BL_ORD_ID\|/i
|
|
120
122
|
if db_type == 'remote' || row['sseqid'].nil?
|
|
121
123
|
file.puts FetchRawSequences.extract_from_remote_db(row['sacc'])
|
|
122
124
|
else
|
|
@@ -136,15 +138,15 @@ module GeneValidator
|
|
|
136
138
|
# first try to extract from previously created raw_sequences HASH
|
|
137
139
|
raw_seq = extract_from_index(identifier) if opt[:raw_sequences]
|
|
138
140
|
# then try to just extract that sequence based on accession.
|
|
139
|
-
if opt[:db] !~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/)
|
|
141
|
+
if opt[:db] !~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
|
|
140
142
|
raw_seq = extract_from_local_db(false, accession)
|
|
141
143
|
end
|
|
142
144
|
# then try to extract from remote database
|
|
143
|
-
if opt[:db] =~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/)
|
|
145
|
+
if opt[:db] =~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
|
|
144
146
|
raw_seq = extract_from_remote_db(accession)
|
|
145
147
|
end
|
|
146
148
|
# return nil if the raw_sequence still produces an error.
|
|
147
|
-
|
|
149
|
+
raw_seq =~ /Error/i ? nil : raw_seq
|
|
148
150
|
end
|
|
149
151
|
|
|
150
152
|
##
|
|
@@ -157,8 +159,8 @@ module GeneValidator
|
|
|
157
159
|
idx = config[:raw_seq_file_load][identifier]
|
|
158
160
|
query = IO.binread(opt[:raw_sequences], idx[1] - idx[0], idx[0])
|
|
159
161
|
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
|
160
|
-
parse_query[1].
|
|
161
|
-
rescue
|
|
162
|
+
parse_query[1].delete("\n")
|
|
163
|
+
rescue StandardError
|
|
162
164
|
'Error' # return error so it can then try alternative fetching method.
|
|
163
165
|
end
|
|
164
166
|
|
|
@@ -170,7 +172,7 @@ module GeneValidator
|
|
|
170
172
|
# Output:
|
|
171
173
|
# String with the nucleotide sequence corresponding to the accession
|
|
172
174
|
def extract_from_local_db(batch, accno = nil, idx_file = nil)
|
|
173
|
-
cmd =
|
|
175
|
+
cmd = batch ? batch_raw_seq_cmd(idx_file) : single_raw_seq_cmd(accno)
|
|
174
176
|
efile = Tempfile.new('blast_out')
|
|
175
177
|
`#{cmd} &>#{efile.path}`
|
|
176
178
|
raw_seqs = efile.read
|
|
@@ -193,9 +195,9 @@ module GeneValidator
|
|
|
193
195
|
def failed_raw_sequences(blast_output)
|
|
194
196
|
blast_output.each_line do |line|
|
|
195
197
|
acc = line.match(/Error: (\w+): OID not found/)[1]
|
|
196
|
-
|
|
198
|
+
warn "\nCould not find sequence '#{acc.chomp}' within the" \
|
|
197
199
|
' BLAST database.'
|
|
198
|
-
|
|
200
|
+
warn "Attempting to obtain sequence '#{acc.chomp}' from" \
|
|
199
201
|
' remote BLAST databases.'
|
|
200
202
|
File.open(opt[:raw_sequences], 'a+') do |f|
|
|
201
203
|
f.puts extract_from_remote_db(acc)
|
|
@@ -204,13 +206,12 @@ module GeneValidator
|
|
|
204
206
|
end
|
|
205
207
|
|
|
206
208
|
def extract_from_remote_db(accession, db_seq_type = 'protein')
|
|
207
|
-
uri = '
|
|
209
|
+
uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' \
|
|
208
210
|
"db=#{db_seq_type}&retmax=1&usehistory=y&term=#{accession}/"
|
|
209
211
|
result = Net::HTTP.get(URI.parse(uri))
|
|
210
212
|
query = result.match(%r{<\bQueryKey\b>([\w\W\d]+)</\bQueryKey\b>})[1]
|
|
211
213
|
web_env = result.match(%r{<\bWebEnv\b>([\w\W\d]+)</\bWebEnv\b>})[1]
|
|
212
|
-
|
|
213
|
-
uri = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
|
|
214
|
+
uri = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
|
|
214
215
|
'rettype=fasta&retmode=text&retstart=0&retmax=1&' \
|
|
215
216
|
"db=#{db_seq_type}&query_key=#{query}&WebEnv=#{web_env}"
|
|
216
217
|
result = Net::HTTP.get(URI.parse(uri))
|
data/lib/genevalidator/hsp.rb
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
require 'forwardable'
|
|
2
|
+
|
|
1
3
|
require 'genevalidator/blast'
|
|
2
4
|
require 'genevalidator/exceptions'
|
|
3
5
|
|
|
4
6
|
module GeneValidator
|
|
5
7
|
# A class that initialises the BLAST tabular attributes
|
|
6
8
|
class Hsp
|
|
9
|
+
extend Forwardable
|
|
10
|
+
def_delegators GeneValidator, :config
|
|
11
|
+
|
|
7
12
|
attr_accessor :hit_from # ref. from the unaligned hit sequence
|
|
8
13
|
attr_accessor :hit_to
|
|
9
14
|
attr_accessor :match_query_from # ref. from the unaligned query sequence
|
|
@@ -23,9 +28,30 @@ module GeneValidator
|
|
|
23
28
|
attr_accessor :gaps
|
|
24
29
|
attr_accessor :align_len
|
|
25
30
|
|
|
26
|
-
def initialize
|
|
31
|
+
def initialize(input = {})
|
|
27
32
|
@query_alignment = nil
|
|
28
33
|
@hit_alignment = nil
|
|
34
|
+
init_xml_attributes(input[:xml_input]) if input[:xml_input]
|
|
35
|
+
init_tabular_attribute(input[:tabular_input]) if input[:tabular_input]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def init_xml_attributes(hsp)
|
|
39
|
+
@match_query_from = hsp.query_from.to_i
|
|
40
|
+
@match_query_to = hsp.query_to.to_i
|
|
41
|
+
@query_reading_frame = hsp.query_frame.to_i
|
|
42
|
+
@hit_from = hsp.hit_from.to_i
|
|
43
|
+
@hit_to = hsp.hit_to.to_i
|
|
44
|
+
@query_alignment = hsp.qseq.to_s
|
|
45
|
+
@hit_alignment = hsp.hseq.to_s
|
|
46
|
+
@align_len = hsp.align_len.to_i
|
|
47
|
+
@pidentity = (100 * hsp.identity / hsp.align_len.to_f).round(2)
|
|
48
|
+
@identity = hsp.identity.to_i
|
|
49
|
+
@hsp_evalue = format('%.0e', hsp.evalue)
|
|
50
|
+
assert_seq_type(@hit_alignment) if @hit_alignment
|
|
51
|
+
assert_seq_type(@query_alignment) if @query_alignment
|
|
52
|
+
return unless config[:type] == :nucleotide
|
|
53
|
+
@match_query_from = (@match_query_from / 3) + 1
|
|
54
|
+
@match_query_to = (@match_query_to / 3) + 1
|
|
29
55
|
end
|
|
30
56
|
|
|
31
57
|
##
|
|
@@ -52,7 +78,10 @@ module GeneValidator
|
|
|
52
78
|
|
|
53
79
|
def assert_seq_type(query)
|
|
54
80
|
seq_type = BlastUtils.guess_sequence_type(query)
|
|
55
|
-
|
|
81
|
+
raise SequenceTypeError if seq_type != :protein
|
|
82
|
+
rescue SequenceTypeError => e
|
|
83
|
+
warn e
|
|
84
|
+
exit 1
|
|
56
85
|
end
|
|
57
86
|
end
|
|
58
87
|
end
|
|
@@ -1,10 +1,7 @@
|
|
|
1
|
-
require 'erb'
|
|
2
|
-
require 'fileutils'
|
|
3
1
|
require 'forwardable'
|
|
4
2
|
require 'json'
|
|
5
3
|
|
|
6
4
|
require 'genevalidator'
|
|
7
|
-
require 'genevalidator/output'
|
|
8
5
|
require 'genevalidator/version'
|
|
9
6
|
|
|
10
7
|
module GeneValidator
|
|
@@ -12,139 +9,58 @@ module GeneValidator
|
|
|
12
9
|
class JsonToGVResults
|
|
13
10
|
class << self
|
|
14
11
|
extend Forwardable
|
|
15
|
-
def_delegators GeneValidator, :opt
|
|
12
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
|
16
13
|
|
|
17
|
-
def init
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
aux: File.expand_path(File.join(File.dirname(__FILE__), '../../aux')),
|
|
23
|
-
filename: File.basename(@opt[:json_file]),
|
|
24
|
-
output_max: 2500,
|
|
25
|
-
run_no: 0
|
|
26
|
-
}
|
|
27
|
-
@json_array = load_json_file
|
|
14
|
+
def init(opt)
|
|
15
|
+
GeneValidator.opt = opt
|
|
16
|
+
GeneValidator.config = { output_max: 2500, run_no: 0,
|
|
17
|
+
json_output: load_json_file }
|
|
18
|
+
GeneValidator.dirs = GeneValidator.setup_dirnames(opt[:json_file])
|
|
28
19
|
end
|
|
29
20
|
|
|
30
21
|
def run
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
output_html = output_filename
|
|
37
|
-
generate_html_header(output_html) unless File.exist?(output_html)
|
|
38
|
-
generate_html_query(output_html, row)
|
|
22
|
+
warn '==> Parsing input JSON results'
|
|
23
|
+
print_console_header(config[:json_output][0])
|
|
24
|
+
config[:json_output].each do |row|
|
|
25
|
+
print_output_console(row)
|
|
26
|
+
create_row_json_plot_files(row)
|
|
39
27
|
end
|
|
40
|
-
|
|
41
|
-
calculate_overall_score
|
|
28
|
+
GeneValidator.produce_output
|
|
42
29
|
end
|
|
43
30
|
|
|
44
|
-
def
|
|
45
|
-
|
|
46
|
-
|
|
31
|
+
def print_console_header(first_row)
|
|
32
|
+
return unless opt[:output_formats].include? 'stdout'
|
|
33
|
+
return if config[:console_header_printed]
|
|
34
|
+
config[:console_header_printed] = true
|
|
35
|
+
warn '' # blank line
|
|
36
|
+
c_fmt = "%3s\t%5s\t%20s\t%7s\t"
|
|
37
|
+
print format(c_fmt, 'No', 'Score', 'Identifier', 'No_Hits')
|
|
38
|
+
puts first_row[:validations].keys.join("\t")
|
|
47
39
|
end
|
|
48
40
|
|
|
49
|
-
def
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
File.join(@config[:html_path], "results#{i}.html")
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
def generate_html_header(output_html)
|
|
61
|
-
return if File.exist?(output_html)
|
|
62
|
-
json_header_template = File.join(@config[:aux], 'json_header.erb')
|
|
63
|
-
template_contents = File.open(json_header_template, 'r').read
|
|
64
|
-
erb = ERB.new(template_contents, 0, '>')
|
|
65
|
-
File.open(output_html, 'w+') { |f| f.write(erb.result(binding)) }
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def generate_html_query(output_html, row)
|
|
69
|
-
@row = row
|
|
70
|
-
json_query_template = File.join(@config[:aux], 'json_query.erb')
|
|
71
|
-
template_contents = File.open(json_query_template, 'r').read
|
|
72
|
-
erb = ERB.new(template_contents, 0, '>')
|
|
73
|
-
File.open(output_html, 'a') { |f| f.write(erb.result(binding)) }
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
# Add footer to all output files
|
|
77
|
-
def html_footer
|
|
78
|
-
no_of_output_files = (@config[:run_no].to_f / @config[:output_max]).ceil
|
|
79
|
-
|
|
80
|
-
output_files = []
|
|
81
|
-
(1..no_of_output_files).each { |i| output_files << "results#{i}.html" }
|
|
82
|
-
|
|
83
|
-
write_html_footer(no_of_output_files, output_files)
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def write_html_footer(no_of_output_files, output_files)
|
|
87
|
-
turn_off_automated_sorting
|
|
88
|
-
json_footer_template = File.join(@config[:aux], 'json_footer.erb')
|
|
89
|
-
template_contents = File.open(json_footer_template, 'r').read
|
|
90
|
-
erb = ERB.new(template_contents, 0, '>')
|
|
91
|
-
(1..no_of_output_files).each do |i|
|
|
92
|
-
results_html = File.join(@config[:html_path], "results#{i}.html")
|
|
93
|
-
File.open(results_html, 'a+') { |f| f.write(erb.result(binding)) }
|
|
41
|
+
def print_output_console(row)
|
|
42
|
+
return unless opt[:output_formats].include? 'stdout'
|
|
43
|
+
c_fmt = "%3s\t%5s\t%20s\t%7s\t"
|
|
44
|
+
short_def = row[:definition].split(' ')[0]
|
|
45
|
+
print format(c_fmt, row[:idx], row[:overall_score], short_def,
|
|
46
|
+
row[:no_hits])
|
|
47
|
+
puts row[:validations].values.map { |e| e[:print] }.join("\t")
|
|
48
|
+
.gsub(' ', ' ')
|
|
94
49
|
end
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# By default, on page load, the results are automatically sorted by the
|
|
98
|
-
# index. However since the whole idea is that users would sort by JSON,
|
|
99
|
-
# this is not wanted here.
|
|
100
|
-
def turn_off_automated_sorting
|
|
101
|
-
script_file = File.join(@config[:html_path],
|
|
102
|
-
'files/js/genevalidator.compiled.min.js')
|
|
103
|
-
original_content = File.read(script_file)
|
|
104
|
-
# removes the automatic sort on page load
|
|
105
|
-
updated_content = original_content.gsub(',sortList:[[0,0]]', '')
|
|
106
|
-
File.open("#{script_file}.tmp", 'w') { |f| f.puts updated_content }
|
|
107
|
-
FileUtils.mv("#{script_file}.tmp", script_file)
|
|
108
|
-
end
|
|
109
50
|
|
|
110
|
-
|
|
111
|
-
scores = []
|
|
112
|
-
@json_array.each { |row| scores << row['overall_score'] }
|
|
113
|
-
plot_dir = File.join(@config[:html_path], 'files/json')
|
|
114
|
-
less = generate_evaluation(scores)
|
|
115
|
-
Output.create_overview_json(scores, plot_dir, less, less)
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
def generate_evaluation(scores)
|
|
119
|
-
no_of_queries = scores.length
|
|
120
|
-
good_scores = scores.count { |s| s >= 75 }
|
|
121
|
-
bad_scores = scores.count { |s| s < 75 }
|
|
122
|
-
nee = calculate_no_quries_with_no_evidence # nee = no evidence
|
|
51
|
+
private
|
|
123
52
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
' were:<br>' \
|
|
129
|
-
"#{good_pred} good prediction(s),<br>" \
|
|
130
|
-
"#{bad_pred} possibly weak prediction(s).<br>"
|
|
131
|
-
return eval if nee == 0
|
|
132
|
-
eval << "#{nee} could not be evaluated due to the lack of" \
|
|
133
|
-
' evidence.<br>'
|
|
134
|
-
eval
|
|
135
|
-
end
|
|
53
|
+
def load_json_file
|
|
54
|
+
json_contents = File.read(File.expand_path(opt[:json_file]))
|
|
55
|
+
JSON.parse(json_contents, symbolize_names: true)
|
|
56
|
+
end
|
|
136
57
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
if status.count { |r| r == 'warning' } == status.length
|
|
143
|
-
all_warnings += 1
|
|
144
|
-
end
|
|
58
|
+
def create_row_json_plot_files(row)
|
|
59
|
+
config[:run_no] += 1
|
|
60
|
+
fname = "#{dirs[:filename]}_#{row[:idx]}.json"
|
|
61
|
+
json_file = File.join(dirs[:json_dir], fname)
|
|
62
|
+
File.open(json_file, 'w') { |f| f.write(row.to_json) }
|
|
145
63
|
end
|
|
146
|
-
all_warnings
|
|
147
|
-
end
|
|
148
64
|
end
|
|
149
65
|
end
|
|
150
66
|
end
|