genevalidator 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'net/http'
|
2
2
|
require 'io/console'
|
3
|
+
require 'yaml'
|
4
|
+
|
3
5
|
module GeneValidator
|
4
6
|
# This is a class for the storing data on each sequence
|
5
7
|
class Sequence
|
@@ -65,7 +67,7 @@ module GeneValidator
|
|
65
67
|
end
|
66
68
|
@raw_sequence = seq
|
67
69
|
else
|
68
|
-
|
70
|
+
$stderr.puts "Getting sequence for '#{accno}' from NCBI - avoid this with '-r'."
|
69
71
|
uri = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'\
|
70
72
|
"db=#{dbtype}&retmax=1&usehistory=y&term=#{accno}/"
|
71
73
|
result = Net::HTTP.get(URI.parse(uri))
|
@@ -85,8 +87,6 @@ module GeneValidator
|
|
85
87
|
@raw_sequence = '' unless @raw_sequence.index(/ERROR/).nil?
|
86
88
|
end
|
87
89
|
@raw_sequence
|
88
|
-
# rescue Exception => error
|
89
|
-
# @raw_sequence = ""
|
90
90
|
end
|
91
91
|
|
92
92
|
##
|
@@ -1,5 +1,9 @@
|
|
1
|
-
require 'genevalidator/exceptions'
|
2
1
|
require 'csv'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
require 'genevalidator/sequences'
|
5
|
+
require 'genevalidator/hsp'
|
6
|
+
require 'genevalidator/exceptions'
|
3
7
|
|
4
8
|
#
|
5
9
|
module GeneValidator
|
@@ -8,6 +12,9 @@ module GeneValidator
|
|
8
12
|
##
|
9
13
|
# This class parses the tabular output of BLAST (outfmt 6 & 7)
|
10
14
|
class TabularParser
|
15
|
+
extend Forwardable
|
16
|
+
def_delegators GeneValidator, :opt, :config
|
17
|
+
|
11
18
|
attr_reader :rows
|
12
19
|
attr_reader :tab_results
|
13
20
|
attr_reader :column_names
|
@@ -15,28 +22,25 @@ module GeneValidator
|
|
15
22
|
|
16
23
|
##
|
17
24
|
# Initializes the object
|
18
|
-
|
19
|
-
|
20
|
-
# +type+: :nucleotide or :mrna
|
21
|
-
def initialize(filename, format, type)
|
25
|
+
def initialize(tab_file = opt[:blast_tabular_file],
|
26
|
+
format = opt[:blast_tabular_options], type = config[:type])
|
22
27
|
@column_names = format.gsub(/[-\d]/, '').split(/[ ,]/)
|
23
|
-
@tab_results = analayse_tabular_file(filename)
|
24
|
-
@rows = @tab_results.to_enum
|
25
28
|
@type = type
|
29
|
+
@tab_results = analayse_tabular_file(tab_file)
|
30
|
+
@rows = @tab_results.to_enum
|
26
31
|
end
|
27
32
|
|
28
33
|
##
|
29
34
|
#
|
30
35
|
def analayse_tabular_file(filename)
|
31
|
-
|
32
|
-
file
|
33
|
-
lines
|
34
|
-
|
35
|
-
headers: @column_names)
|
36
|
+
results = []
|
37
|
+
file = File.read(filename)
|
38
|
+
lines = CSV.parse(file, col_sep: "\t", skip_lines: /^#/,
|
39
|
+
headers: @column_names)
|
36
40
|
lines.each do |line|
|
37
|
-
|
41
|
+
results << line.to_hash
|
38
42
|
end
|
39
|
-
|
43
|
+
results
|
40
44
|
end
|
41
45
|
|
42
46
|
##
|
@@ -58,17 +62,19 @@ module GeneValidator
|
|
58
62
|
def parse_next(query_id = nil)
|
59
63
|
current_id = @rows.peek['qseqid']
|
60
64
|
return [] if !query_id.nil? && current_id != query_id
|
61
|
-
|
62
|
-
hit_seq = initialise_classes(hits)
|
65
|
+
hit_seq = initialise_classes(current_id)
|
63
66
|
move_to_next_query
|
64
67
|
hit_seq
|
65
68
|
rescue StopIteration
|
66
69
|
return []
|
67
70
|
end
|
68
71
|
|
72
|
+
private
|
73
|
+
|
69
74
|
##
|
70
75
|
#
|
71
|
-
def initialise_classes(
|
76
|
+
def initialise_classes(current_id, tab_results = @tab_results)
|
77
|
+
hits = tab_results.partition { |h| h['qseqid'] == current_id }[0]
|
72
78
|
hit_list = []
|
73
79
|
grouped_hits = hits.group_by { |row| row['sseqid'] }
|
74
80
|
|
@@ -90,7 +96,7 @@ module GeneValidator
|
|
90
96
|
hsps = hits.select { |row| row['sseqid'] == current_query_id }
|
91
97
|
hsps.each do |row|
|
92
98
|
hsp = Hsp.new
|
93
|
-
hsp.init_tabular_attribute(row
|
99
|
+
hsp.init_tabular_attribute(row)
|
94
100
|
hit_seq.hsp_list.push(hsp)
|
95
101
|
end
|
96
102
|
end
|
@@ -0,0 +1,279 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'genevalidator/blast'
|
4
|
+
require 'genevalidator/exceptions'
|
5
|
+
require 'genevalidator/output'
|
6
|
+
require 'genevalidator/pool'
|
7
|
+
require 'genevalidator/sequences'
|
8
|
+
require 'genevalidator/validation_length_cluster'
|
9
|
+
require 'genevalidator/validation_length_rank'
|
10
|
+
require 'genevalidator/validation_blast_reading_frame'
|
11
|
+
require 'genevalidator/validation_gene_merge'
|
12
|
+
require 'genevalidator/validation_duplication'
|
13
|
+
require 'genevalidator/validation_open_reading_frame'
|
14
|
+
require 'genevalidator/validation_alignment'
|
15
|
+
|
16
|
+
# Top level module / namespace.
|
17
|
+
module GeneValidator
  # Simple two-field record. In this file it is used by
  # Validate#generate_run_overview to accumulate, per validation,
  # x = summed run time and y = number of runs.
  Pair1 = Struct.new(:x, :y)

  # Walks over the queries of the input FASTA file and runs a separate
  # Validate instance for each query.
  class Validations
    extend Forwardable
    def_delegators GeneValidator, :opt, :config, :query_idx

    ##
    # Caches the GeneValidator-level opt, config and query_idx values.
    def initialize
      @opt       = opt
      @config    = config
      @query_idx = query_idx
    end

    ##
    # Runs the validations for every remaining query.
    # Params:
    # +iterator+: object yielding the BLAST results; it is passed on to
    # parse_next_iteration
    # With @opt[:num_threads] > 1 the work is scheduled on a Pool, otherwise
    # each query is validated inline.
    def run_validations(iterator)
      threaded = @opt[:num_threads] > 1
      pool     = Pool.new(@opt[:num_threads]) if threaded

      while @config[:idx] + 1 < @query_idx.length
        prediction = get_info_on_query_sequence
        @config[:idx] += 1

        blast_hits = parse_next_iteration(iterator, prediction)

        # no further BLAST results: undo the index increment and stop
        if blast_hits.nil?
          @config[:idx] -= 1
          break
        end

        if threaded
          pool.schedule(prediction, blast_hits, @config[:idx]) do |pred, hits, idx|
            Validate.new.validate(pred, hits, idx)
          end
        else
          Validate.new.validate(prediction, blast_hits, @config[:idx])
        end
      end
    ensure
      # only shut down a pool that was actually created
      pool.shutdown if pool
    end

    ##
    # Builds a Sequence object for the query at position @config[:idx].
    # Params:
    # +input_file+: path of the FASTA file holding the queries
    # +seq_type+: sequence type; :nucleotide makes length_protein a third of
    # the raw sequence length
    # Output:
    # +Sequence+ object describing the query
    def get_info_on_query_sequence(input_file = @opt[:input_fasta_file],
                                   seq_type = @config[:type])
      # IO.binread takes (name, length, offset): the offset is the current
      # entry of the query index, the length is the distance to the next entry
      offset = @query_idx[@config[:idx]]
      length = @query_idx[@config[:idx] + 1] - offset
      query  = IO.binread(input_file, length, offset)
      parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]

      prediction                = Sequence.new
      prediction.definition     = parse_query[0].gsub("\n", '')
      prediction.identifier     = prediction.definition.gsub(/ .*/, '')
      prediction.type           = seq_type
      prediction.raw_sequence   = parse_query[1].gsub("\n", '')
      prediction.length_protein = prediction.raw_sequence.length
      prediction.length_protein /= 3 if seq_type == :nucleotide
      prediction
    end

    ##
    # Fetches the BLAST hits of the next query.
    # Params:
    # +iterator+: BLAST results iterator (XML) or tabular parser
    # +prediction+: +Sequence+ object of the current query
    # Output:
    # whatever the underlying parser returns; nil when neither a BLAST XML
    # file nor a BLAST tabular file is configured
    def parse_next_iteration(iterator, prediction)
      iterator.next if @config[:idx] < @config[:start_idx]
      if @opt[:blast_xml_file]
        BlastUtils.parse_next(iterator)
      elsif @opt[:blast_tabular_file]
        iterator.parse_next(prediction.identifier)
      end
    end
  end

  # Class that runs the validations (instantiated for each query)
  class Validate
    extend Forwardable
    def_delegators GeneValidator, :opt, :config, :mutex_array, :overview

    ##
    # Initializes the object. Takes no arguments: opt, config, mutex_array and
    # overview are read from the GeneValidator module through the delegators.
    def initialize
      @opt         = opt
      @config      = config
      @mutex_array = mutex_array
      @run_output  = nil

      @overview = overview
    end

    ##
    # Validate one query and create validation report
    # Params:
    # +prediction+: Sequence object
    # +hits+: Array of +Sequence+ objects
    # +current_idx+: the index number of the query
    def validate(prediction, hits, current_idx)
      hits = remove_identical_hits(prediction, hits)
      vals = create_validation_tests(prediction, hits)
      check_validations(vals)
      vals.each(&:run)
      @run_output = Output.new(current_idx, hits.length, prediction.definition)
      @run_output.validations = vals.map(&:validation_report)
      check_validations_output(vals)

      compute_scores
      generate_run_output
    end

    ##
    # Removes identical hits (100% coverage and >99% identity)
    # Params:
    # +prediction+: Sequence object
    # +hits+: Array of +Sequence+ objects
    # Output:
    # the same +hits+ array (modified in place) without the identical hits
    def remove_identical_hits(prediction, hits)
      identical_hits = hits.select do |hit|
        # an HSP without identity data counts as "not identical"; testing for
        # nil first avoids calling < on nil
        next false if hit.hsp_list.any? { |hsp| hsp.pidentity.nil? || hsp.pidentity < 99 }

        # check the coverage
        coverage = Array.new(prediction.length_protein, 0)
        hit.hsp_list.each do |hsp|
          match_to   = hsp.match_query_to
          match_from = hsp.match_query_from
          len        = match_to - match_from + 1
          coverage[match_from - 1..match_to - 1] = Array.new(len, 1)
        end

        # every position must be covered; a hit without any HSP (all zeros)
        # is therefore not treated as identical
        coverage.uniq == [1]
      end

      identical_hits.each { |hit| hits.delete(hit) }
      hits
    end

    ##
    # Instantiates the validation tests and keeps those whose cli_name is
    # listed in @opt[:validations]. The two reading frame validations are only
    # created when @config[:type] is :nucleotide.
    # Params:
    # +prediction+: Sequence object
    # +hits+: Array of +Sequence+ objects
    # Output:
    # Array of validation test objects
    def create_validation_tests(prediction, hits)
      val = []
      val.push LengthClusterValidation.new(prediction, hits)
      val.push LengthRankValidation.new(prediction, hits)
      val.push GeneMergeValidation.new(prediction, hits)
      val.push DuplicationValidation.new(prediction, hits)
      if @config[:type] == :nucleotide
        val.push BlastReadingFrameValidation.new(prediction, hits)
        val.push OpenReadingFrameValidation.new(prediction, hits)
      end
      val.push AlignmentValidation.new(prediction, hits)
      val.select { |v| @opt[:validations].include? v.cli_name.downcase }
    end

    ##
    # Sanity checks on the validation tests. Prints the error to stderr and
    # exits with status 1 when a test is not a ValidationTest or when two
    # tests share the same cli_name.
    # Params:
    # +vals+: Array of validation test objects
    def check_validations(vals)
      # check the class type of the elements in the list
      vals.each { |v| fail ValidationClassError unless v.is_a? ValidationTest }
      # check alias duplication
      aliases = vals.map(&:cli_name)
      fail AliasDuplicationError unless aliases.length == aliases.uniq.length
    rescue ValidationClassError, AliasDuplicationError => e
      $stderr.puts e
      exit 1
    end

    ##
    # Sanity checks on the produced reports. Prints the error to stderr and
    # exits with status 1 when there is no report at all or when a report is
    # not a ValidationReport.
    # Params:
    # +vals+: Array of validation test objects (already run)
    def check_validations_output(vals)
      fail NoValidationError if @run_output.validations.length == 0
      vals.each do |v|
        fail ReportClassError unless v.validation_report.is_a? ValidationReport
      end
    rescue NoValidationError, ReportClassError => e
      $stderr.puts e
      exit 1
    end

    ##
    # Computes the number of successes and fails and the overall score
    # (percentage of successes) and stores them in @run_output.
    def compute_scores
      validations = @run_output.validations
      scores = {}
      scores[:successes] = validations.count { |v| v.result == v.expected }
      scores[:fails] = validations.count do |v|
        v.validation != :unapplicable && v.validation != :error &&
          v.result != v.expected
      end
      scores = length_validation_scores(validations, scores)

      @run_output.successes = scores[:successes]
      @run_output.fails     = scores[:fails]
      # NOTE(review): to_i truncates a half success (see
      # length_validation_scores) before the sum - confirm this is intended.
      total_query = scores[:successes].to_i + scores[:fails]
      # nothing could be scored (e.g. every validation unapplicable or in
      # error): report 0 instead of dividing by zero
      @run_output.overall_score =
        if total_query == 0
          0
        else
          (scores[:successes] * 100 / total_query).round
        end
    end

    # Since there are two length validations, it is necessary to adjust the
    # scores accordingly
    # Params:
    # +validations+: Array of validation reports
    # +scores+: Hash with the keys :successes and :fails (modified in place)
    # Output:
    # the adjusted +scores+ hash
    def length_validation_scores(validations, scores)
      lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
      lrv = validations.select { |v| v.class == LengthRankValidationOutput }
      return scores unless lcv.length == 1 && lrv.length == 1

      score_lcv = (lcv[0].result == lcv[0].expected)
      score_lrv = (lrv[0].result == lrv[0].expected)
      if score_lcv && score_lrv
        scores[:successes] -= 1 # if both are true: counted as 1 success
      elsif !score_lcv && !score_lrv
        scores[:fails] -= 1 # if both are false: counted as 1 fail
      else
        scores[:successes] -= 0.5
        scores[:fails]     -= 0.5
      end
      scores
    end

    ##
    # Writes the HTML and JSON output of the query, prints the console output
    # and updates the run overview.
    def generate_run_output
      @run_output.generate_html
      @run_output.generate_json
      @run_output.print_output_console
      generate_run_overview
    end

    ##
    # Adds the results of the current query to the shared @overview hash.
    # The update is done inside @mutex_array.synchronize.
    def generate_run_overview
      vals        = @run_output.validations
      no_mafft    = 0
      no_internet = 0
      errors      = []
      vals.each do |v|
        unless v.errors.nil?
          no_mafft    += v.errors.count { |e| e == NoMafftInstallationError }
          no_internet += v.errors.count { |e| e == NoInternetError }
        end
        errors.push(v.short_header) if v.validation == :error
      end

      # "not enough evidence": every validation is unapplicable or a warning
      no_evidence = vals.all? { |v| v.result == :unapplicable || v.result == :warning }
      nee = no_evidence ? 1 : 0

      good_scores = (@run_output.overall_score >= 75) ? 1 : 0
      bad_scores  = 1 - good_scores

      @mutex_array.synchronize do
        @overview[:no_queries] += 1
        @overview[:scores].push(@run_output.overall_score)
        @overview[:good_scores] += good_scores
        @overview[:bad_scores]  += bad_scores
        @overview[:nee]         += nee
        @overview[:no_mafft]    += no_mafft
        @overview[:no_internet] += no_internet
        errors.each { |err| @overview[:map_errors][err] += 1 }

        vals.each do |v|
          next if v.run_time == 0 || v.run_time.nil?
          next if v.validation == :unapplicable || v.validation == :error
          previous = @overview[:run_time][v.short_header]
          @overview[:run_time][v.short_header] =
            Pair1.new(previous.x + v.run_time, previous.y + 1)
        end
      end
    end
  end
end
|
@@ -1,5 +1,9 @@
|
|
1
|
-
require '
|
1
|
+
require 'bio'
|
2
|
+
require 'forwardable'
|
3
|
+
|
2
4
|
require 'genevalidator/exceptions'
|
5
|
+
require 'genevalidator/validation_report'
|
6
|
+
require 'genevalidator/validation_test'
|
3
7
|
|
4
8
|
module GeneValidator
|
5
9
|
##
|
@@ -81,7 +85,8 @@ module GeneValidator
|
|
81
85
|
# This class contains the methods necessary for
|
82
86
|
# validations based on multiple alignment
|
83
87
|
class AlignmentValidation < ValidationTest
|
84
|
-
|
88
|
+
extend Forwardable
|
89
|
+
def_delegators GeneValidator, :opt, :config
|
85
90
|
attr_reader :multiple_alignment
|
86
91
|
attr_reader :raw_seq_file
|
87
92
|
attr_reader :index_file_name
|
@@ -90,32 +95,26 @@ module GeneValidator
|
|
90
95
|
##
|
91
96
|
# Initializes the object
|
92
97
|
# Params:
|
93
|
-
# +type+: type of the predicted sequence (:nucleotide or :protein)
|
94
98
|
# +prediction+: a +Sequence+ object representing the blast query
|
95
99
|
# +hits+: a vector of +Sequence+ objects (representing blast hits)
|
96
|
-
# +
|
97
|
-
|
98
|
-
# +raw_seq_file+: name of the fasta file with raw sequences
|
99
|
-
# +index_file_name+: name of the fasta index file
|
100
|
-
# +raw_seq_file_load+: String - loaded content of the index file
|
101
|
-
def initialize(type, prediction, hits, filename, raw_seq_file,
|
102
|
-
index_file_name, raw_seq_file_load, db, num_threads)
|
100
|
+
# +plot_path+: name of the fasta file
|
101
|
+
def initialize(prediction, hits)
|
103
102
|
super
|
104
|
-
@short_header = '
|
105
|
-
@
|
103
|
+
@short_header = 'MissingExtraSequences'
|
104
|
+
@cli_name = 'align'
|
105
|
+
@header = 'Missing/Extra Sequences'
|
106
106
|
@description = 'Finds missing and extra sequences in the' \
|
107
107
|
' prediction, based on the multiple alignment of' \
|
108
108
|
' the best hits. Also counts the percentage of' \
|
109
109
|
' the conserved regions that appear in the' \
|
110
110
|
' prediction.'
|
111
|
-
@
|
112
|
-
@
|
113
|
-
@
|
114
|
-
@
|
115
|
-
@db = db
|
111
|
+
@raw_seq_file = opt[:raw_sequences]
|
112
|
+
@index_file_name = config[:raw_seq_file_index]
|
113
|
+
@raw_seq_file_load = config[:raw_seq_file_load]
|
114
|
+
@db = opt[:db]
|
116
115
|
@multiple_alignment = []
|
117
|
-
@
|
118
|
-
@
|
116
|
+
@num_threads = opt[:num_threads]
|
117
|
+
@type = config[:type]
|
119
118
|
end
|
120
119
|
|
121
120
|
##
|
@@ -189,39 +188,32 @@ module GeneValidator
|
|
189
188
|
@description, gaps,
|
190
189
|
extra_seq, consensus)
|
191
190
|
@validation_report.plot_files.push(plot1)
|
192
|
-
@validation_report.
|
191
|
+
@validation_report.run_time = Time.now - start
|
193
192
|
@validation_report
|
194
193
|
|
195
194
|
rescue NotEnoughHitsError
|
196
195
|
@validation_report = ValidationReport.new('Not enough evidence',
|
197
196
|
:warning, @short_header,
|
198
|
-
@header, @description
|
199
|
-
@approach, @explanation,
|
200
|
-
@conclusion)
|
197
|
+
@header, @description)
|
201
198
|
rescue NoMafftInstallationError
|
202
199
|
@validation_report = ValidationReport.new('Mafft error', :error,
|
203
200
|
@short_header, @header,
|
204
|
-
@description
|
205
|
-
@explanation, @conclusion)
|
201
|
+
@description)
|
206
202
|
@validation_report.errors.push NoMafftInstallationError
|
207
203
|
rescue NoInternetError
|
208
204
|
@validation_report = ValidationReport.new('Internet error', :error,
|
209
205
|
@short_header, @header,
|
210
|
-
@description
|
211
|
-
@explanation, @conclusion)
|
206
|
+
@description)
|
212
207
|
@validation_report.errors.push NoInternetError
|
213
208
|
rescue ReadingFrameError
|
214
209
|
@validation_report = ValidationReport.new('Multiple reading frames',
|
215
210
|
:error, @short_header,
|
216
|
-
@header, @description
|
217
|
-
@approach, @explanation,
|
218
|
-
@conclusion)
|
211
|
+
@header, @description)
|
219
212
|
@validation_report.errors.push 'Multiple reading frames Error'
|
220
213
|
rescue Exception
|
221
214
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
222
215
|
@short_header, @header,
|
223
|
-
@description
|
224
|
-
@explanation, @conclusion)
|
216
|
+
@description)
|
225
217
|
@validation_report.errors.push 'Unexpected Error'
|
226
218
|
end
|
227
219
|
|
@@ -417,9 +409,9 @@ module GeneValidator
|
|
417
409
|
# lines for multiple hits alignment, prediction and statistical model
|
418
410
|
# Params:
|
419
411
|
# +freq+: +String+ residue frequency from the statistical model
|
420
|
-
# +output+:
|
412
|
+
# +output+: plot_path of the json file
|
421
413
|
# +ma+: +String+ array with the multiple aligned hits and prediction
|
422
|
-
def plot_alignment(freq,
|
414
|
+
def plot_alignment(freq, ma = @multiple_alignment)
|
423
415
|
# get indices of consensus in the multiple alignment
|
424
416
|
consensus = get_consensus(@multiple_alignment[0..@multiple_alignment.length - 2])
|
425
417
|
consensus_idxs = consensus.split(//).each_index.select { |j| isalpha(consensus[j]) }
|
@@ -438,32 +430,24 @@ module GeneValidator
|
|
438
430
|
|
439
431
|
len = ma[0].length
|
440
432
|
|
441
|
-
f = File.open(output, 'w')
|
442
|
-
f.write((
|
443
433
|
# plot statistical model
|
444
|
-
freq.each_with_index.map { |
|
434
|
+
data = freq.each_with_index.map { |h, j| { 'y' => ma.length, 'start' => j, 'stop' => j + 1, 'color' => 'orange', 'height' => h } } +
|
445
435
|
# hits
|
446
436
|
match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten +
|
447
|
-
ma[0..ma.length - 2].each_with_index.map { |_seq, j|
|
448
|
-
consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }
|
449
|
-
}.flatten +
|
437
|
+
ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten +
|
450
438
|
# plot prediction
|
451
439
|
[{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] +
|
452
440
|
query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten +
|
453
441
|
|
454
442
|
# plot consensus
|
455
|
-
consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
|
456
|
-
|
457
|
-
f.close
|
443
|
+
consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
|
458
444
|
|
459
445
|
yAxisValues = 'Prediction'
|
460
|
-
(1..ma.length - 1).each
|
461
|
-
yAxisValues << ", hit #{i}"
|
462
|
-
end
|
446
|
+
(1..ma.length - 1).each { |i| yAxisValues << ", hit #{i}" }
|
463
447
|
|
464
448
|
yAxisValues << ', Statistical Model'
|
465
449
|
|
466
|
-
Plot.new(
|
450
|
+
Plot.new(data,
|
467
451
|
:align,
|
468
452
|
'Missing/Extra sequences Validation: Multiple Align. & Statistical model of hits',
|
469
453
|
'Conserved Region, Yellow',
|