genevalidator 1.6.12 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
data/lib/genevalidator/pool.rb
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# coding: utf-8
|
|
2
1
|
# From http://burgestrand.se/code/ruby-thread-pool/
|
|
3
2
|
#
|
|
4
3
|
# Copyright © 2012, Kim Burgestrand kim@burgestrand.se
|
|
@@ -21,8 +20,6 @@
|
|
|
21
20
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
21
|
# SOFTWARE.
|
|
23
22
|
|
|
24
|
-
require 'thread'
|
|
25
|
-
|
|
26
23
|
# Class that creates a thread safe pool.
|
|
27
24
|
class Pool
|
|
28
25
|
def initialize(size)
|
|
@@ -58,8 +55,8 @@ if $PROGRAM_NAME == __FILE__
|
|
|
58
55
|
|
|
59
56
|
20.times do |i|
|
|
60
57
|
p.schedule do
|
|
61
|
-
sleep rand(
|
|
62
|
-
|
|
58
|
+
sleep rand(2..5)
|
|
59
|
+
warn "Job #{i} finished by thread #{Thread.current[:id]}\n"
|
|
63
60
|
end
|
|
64
61
|
end
|
|
65
62
|
at_exit { p.shutdown }
|
data/lib/genevalidator/query.rb
CHANGED
|
@@ -5,7 +5,6 @@ require 'genevalidator/exceptions'
|
|
|
5
5
|
require 'genevalidator/hsp'
|
|
6
6
|
require 'genevalidator/query'
|
|
7
7
|
|
|
8
|
-
#
|
|
9
8
|
module GeneValidator
|
|
10
9
|
##
|
|
11
10
|
# This class parses the tabular output of BLAST (outfmt 6 & 7)
|
|
@@ -31,14 +30,9 @@ module GeneValidator
|
|
|
31
30
|
##
|
|
32
31
|
#
|
|
33
32
|
def analayse_tabular_file(filename)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
lines
|
|
37
|
-
headers: @column_names)
|
|
38
|
-
lines.each do |line|
|
|
39
|
-
results << line.to_hash
|
|
40
|
-
end
|
|
41
|
-
results
|
|
33
|
+
lines = CSV.parse(File.read(filename), col_sep: "\t", skip_lines: /^#/,
|
|
34
|
+
headers: @column_names)
|
|
35
|
+
lines.map(&:to_hash)
|
|
42
36
|
end
|
|
43
37
|
|
|
44
38
|
##
|
|
@@ -63,7 +57,7 @@ module GeneValidator
|
|
|
63
57
|
move_to_next_query
|
|
64
58
|
hit_seq
|
|
65
59
|
rescue StopIteration
|
|
66
|
-
|
|
60
|
+
[]
|
|
67
61
|
end
|
|
68
62
|
|
|
69
63
|
private
|
|
@@ -72,29 +66,14 @@ module GeneValidator
|
|
|
72
66
|
#
|
|
73
67
|
def initialise_classes(current_id, tab_results = @tab_results)
|
|
74
68
|
hits = tab_results.partition { |h| h['qseqid'] == current_id }[0]
|
|
75
|
-
hit_list = []
|
|
76
69
|
grouped_hits = hits.group_by { |row| row['sseqid'] }
|
|
77
70
|
|
|
78
|
-
grouped_hits.
|
|
71
|
+
grouped_hits.map do |_query_id, rows|
|
|
79
72
|
hit_seq = Query.new
|
|
80
|
-
hit_seq.init_tabular_attribute(
|
|
81
|
-
|
|
82
|
-
initialise_all_hsps(query_id, hits, hit_seq)
|
|
83
|
-
|
|
73
|
+
hit_seq.init_tabular_attribute(rows[0])
|
|
74
|
+
hit_seq.hsp_list = rows.map { |row| Hsp.new(tabular_input: row) }
|
|
84
75
|
hit_seq.type = :protein
|
|
85
|
-
|
|
86
|
-
end
|
|
87
|
-
hit_list
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
##
|
|
91
|
-
#
|
|
92
|
-
def initialise_all_hsps(current_query_id, hits, hit_seq)
|
|
93
|
-
hsps = hits.select { |row| row['sseqid'] == current_query_id }
|
|
94
|
-
hsps.each do |row|
|
|
95
|
-
hsp = Hsp.new
|
|
96
|
-
hsp.init_tabular_attribute(row)
|
|
97
|
-
hit_seq.hsp_list.push(hsp)
|
|
76
|
+
hit_seq
|
|
98
77
|
end
|
|
99
78
|
end
|
|
100
79
|
end
|
|
@@ -49,11 +49,14 @@ module GeneValidator
|
|
|
49
49
|
break
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
+
arr_idx = @config[:idx] - 1
|
|
53
|
+
next unless @config[:json_output][arr_idx].nil?
|
|
54
|
+
|
|
52
55
|
if @opt[:num_threads] == 1
|
|
53
|
-
|
|
56
|
+
Validate.new.validate(prediction, blast_hits, @config[:idx])
|
|
54
57
|
else
|
|
55
58
|
p.schedule(prediction, blast_hits, @config[:idx]) do |pred, hits, idx|
|
|
56
|
-
|
|
59
|
+
Validate.new.validate(pred, hits, idx)
|
|
57
60
|
end
|
|
58
61
|
end
|
|
59
62
|
end
|
|
@@ -63,18 +66,16 @@ module GeneValidator
|
|
|
63
66
|
|
|
64
67
|
##
|
|
65
68
|
# get info about the query
|
|
66
|
-
def get_info_on_query_sequence(
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
end_offset = @query_idx[@config[:idx]]
|
|
70
|
-
query = IO.binread(input_file, start_offset, end_offset)
|
|
69
|
+
def get_info_on_query_sequence(seq_type = @config[:type],
|
|
70
|
+
index = @config[:idx])
|
|
71
|
+
query = GeneValidator.extract_input_fasta_sequence(index)
|
|
71
72
|
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
|
72
73
|
|
|
73
74
|
prediction = Query.new
|
|
74
|
-
prediction.definition = parse_query[0].
|
|
75
|
+
prediction.definition = parse_query[0].delete("\n")
|
|
75
76
|
prediction.identifier = prediction.definition.gsub(/ .*/, '')
|
|
76
77
|
prediction.type = seq_type
|
|
77
|
-
prediction.raw_sequence = parse_query[1].
|
|
78
|
+
prediction.raw_sequence = parse_query[1].delete("\n")
|
|
78
79
|
prediction.length_protein = prediction.raw_sequence.length
|
|
79
80
|
prediction.length_protein /= 3 if seq_type == :nucleotide
|
|
80
81
|
prediction
|
|
@@ -82,10 +83,10 @@ module GeneValidator
|
|
|
82
83
|
|
|
83
84
|
# Adds 'maker' to @opt[:validations] if the first definiton in the input
|
|
84
85
|
# fasta file contains MAKER's QI (quality index) score
|
|
85
|
-
def check_if_maker_input?
|
|
86
|
-
query =
|
|
86
|
+
def check_if_maker_input?
|
|
87
|
+
query = GeneValidator.extract_input_fasta_sequence(0)
|
|
87
88
|
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
|
88
|
-
definition = parse_query[0].
|
|
89
|
+
definition = parse_query[0].delete("\n")
|
|
89
90
|
number = '-?\d*\.?\d*'
|
|
90
91
|
qi_match = definition.match(/QI:#{number}\|#{number}\|#{number}\|
|
|
91
92
|
#{number}\|#{number}\|#{number}\|
|
|
@@ -107,8 +108,7 @@ module GeneValidator
|
|
|
107
108
|
# Class that runs the validations (Instatiated for each query)
|
|
108
109
|
class Validate
|
|
109
110
|
extend Forwardable
|
|
110
|
-
def_delegators GeneValidator, :opt, :config, :
|
|
111
|
-
:query_idx
|
|
111
|
+
def_delegators GeneValidator, :opt, :config, :overview, :query_idx
|
|
112
112
|
|
|
113
113
|
##
|
|
114
114
|
# Initilizes the object
|
|
@@ -121,7 +121,6 @@ module GeneValidator
|
|
|
121
121
|
def initialize
|
|
122
122
|
@opt = opt
|
|
123
123
|
@config = config
|
|
124
|
-
@mutex_array = mutex_array
|
|
125
124
|
@run_output = nil
|
|
126
125
|
@overview = overview
|
|
127
126
|
@query_idx = query_idx
|
|
@@ -142,7 +141,7 @@ module GeneValidator
|
|
|
142
141
|
@run_output.validations = vals.map(&:validation_report)
|
|
143
142
|
check_validations_output(vals)
|
|
144
143
|
|
|
145
|
-
|
|
144
|
+
compute_run_score
|
|
146
145
|
generate_run_output
|
|
147
146
|
end
|
|
148
147
|
|
|
@@ -154,27 +153,24 @@ module GeneValidator
|
|
|
154
153
|
# Output:
|
|
155
154
|
# new array of hit +Sequence+ objects
|
|
156
155
|
def remove_identical_hits(prediction, hits)
|
|
157
|
-
|
|
158
|
-
hits.each do |hit|
|
|
156
|
+
hits.delete_if do |hit|
|
|
159
157
|
low_identity = hit.hsp_list.select { |hsp| hsp.pidentity < 99 }
|
|
160
158
|
no_data = hit.hsp_list.select { |hsp| hsp.pidentity.nil? }
|
|
161
159
|
low_identity += no_data
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
hit.hsp_list.each do |hsp|
|
|
165
|
-
match_to = hsp.match_query_to
|
|
166
|
-
match_from = hsp.match_query_from
|
|
167
|
-
len = match_to - match_from + 1
|
|
168
|
-
coverage[match_from - 1..match_to - 1] = Array.new(len, 1)
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
if low_identity.length == 0 && coverage.uniq.length == 1
|
|
172
|
-
identical_hits.push(hit)
|
|
173
|
-
end
|
|
160
|
+
coverage = check_hit_coverage(prediction, hit)
|
|
161
|
+
low_identity.empty? && coverage.uniq.length == 1
|
|
174
162
|
end
|
|
163
|
+
end
|
|
175
164
|
|
|
176
|
-
|
|
177
|
-
|
|
165
|
+
def check_hit_coverage(prediction, hit)
|
|
166
|
+
coverage = Array.new(prediction.length_protein, 0)
|
|
167
|
+
hit.hsp_list.each do |hsp|
|
|
168
|
+
match_to = hsp.match_query_to
|
|
169
|
+
match_from = hsp.match_query_from
|
|
170
|
+
len = match_to - match_from + 1
|
|
171
|
+
coverage[match_from - 1..match_to - 1] = Array.new(len, 1)
|
|
172
|
+
end
|
|
173
|
+
coverage
|
|
178
174
|
end
|
|
179
175
|
|
|
180
176
|
def create_validation_tests(prediction, hits)
|
|
@@ -197,45 +193,48 @@ module GeneValidator
|
|
|
197
193
|
|
|
198
194
|
def check_validations(vals)
|
|
199
195
|
# check the class type of the elements in the list
|
|
200
|
-
vals.each { |v|
|
|
196
|
+
vals.each { |v| raise ValidationClassError unless v.is_a? ValidationTest }
|
|
201
197
|
# check alias duplication
|
|
202
198
|
aliases = vals.map(&:cli_name)
|
|
203
|
-
|
|
199
|
+
raise AliasDuplicationError unless aliases.length == aliases.uniq.length
|
|
204
200
|
rescue ValidationClassError => e
|
|
205
|
-
|
|
201
|
+
warn e
|
|
206
202
|
exit 1
|
|
207
203
|
rescue AliasDuplicationError => e
|
|
208
|
-
|
|
204
|
+
warn e
|
|
209
205
|
exit 1
|
|
210
206
|
end
|
|
211
207
|
|
|
212
208
|
def check_validations_output(vals)
|
|
213
|
-
|
|
209
|
+
raise NoValidationError if @run_output.validations.empty?
|
|
214
210
|
vals.each do |v|
|
|
215
|
-
|
|
211
|
+
raise ReportClassError unless v.validation_report.is_a? ValidationReport
|
|
216
212
|
end
|
|
217
213
|
rescue NoValidationError => e
|
|
218
|
-
|
|
214
|
+
warn e
|
|
219
215
|
exit 1
|
|
220
216
|
rescue ReportClassError => e
|
|
221
|
-
|
|
217
|
+
warn e
|
|
222
218
|
exit 1
|
|
223
219
|
end
|
|
224
220
|
|
|
225
|
-
def
|
|
221
|
+
def compute_run_score
|
|
226
222
|
validations = @run_output.validations
|
|
227
223
|
scores = {}
|
|
228
224
|
scores[:successes] = validations.count { |v| v.result == v.expected }
|
|
229
|
-
scores[:fails] = validations.count
|
|
230
|
-
|
|
225
|
+
scores[:fails] = validations.count do |v|
|
|
226
|
+
v.validation != :unapplicable && v.validation != :error &&
|
|
227
|
+
v.result != v.expected
|
|
228
|
+
end
|
|
229
|
+
scores = length_validation_scores(validations, scores)
|
|
231
230
|
|
|
232
|
-
@run_output.successes
|
|
233
|
-
@run_output.fails
|
|
234
|
-
|
|
235
|
-
if
|
|
231
|
+
@run_output.successes = scores[:successes]
|
|
232
|
+
@run_output.fails = scores[:fails]
|
|
233
|
+
num_total_validations = scores[:successes].to_i + scores[:fails]
|
|
234
|
+
if num_total_validations.zero?
|
|
236
235
|
@run_output.overall_score = 0
|
|
237
236
|
else
|
|
238
|
-
@run_output.overall_score = (scores[:successes] * 90 /
|
|
237
|
+
@run_output.overall_score = (scores[:successes] * 90 / num_total_validations).round
|
|
239
238
|
end
|
|
240
239
|
end
|
|
241
240
|
|
|
@@ -260,49 +259,8 @@ module GeneValidator
|
|
|
260
259
|
end
|
|
261
260
|
|
|
262
261
|
def generate_run_output
|
|
263
|
-
@run_output.generate_html
|
|
264
|
-
@run_output.generate_json
|
|
265
262
|
@run_output.print_output_console
|
|
266
|
-
|
|
267
|
-
end
|
|
268
|
-
|
|
269
|
-
def generate_run_overview
|
|
270
|
-
vals = @run_output.validations
|
|
271
|
-
no_mafft = 0
|
|
272
|
-
no_internet = 0
|
|
273
|
-
errors = []
|
|
274
|
-
vals.each do |v|
|
|
275
|
-
unless v.errors.nil?
|
|
276
|
-
no_mafft += v.errors.count { |e| e == NoMafftInstallationError }
|
|
277
|
-
no_internet += v.errors.count { |e| e == NoInternetError }
|
|
278
|
-
end
|
|
279
|
-
errors.push(v.short_header) if v.validation == :error
|
|
280
|
-
end
|
|
281
|
-
|
|
282
|
-
no_evidence = vals.count { |v| v.result == :unapplicable || v.result == :warning } == vals.length
|
|
283
|
-
nee = (no_evidence) ? 1 : 0
|
|
284
|
-
|
|
285
|
-
good_scores = (@run_output.overall_score >= 75) ? 1 : 0
|
|
286
|
-
bad_scores = (@run_output.overall_score >= 75) ? 0 : 1
|
|
287
|
-
|
|
288
|
-
@mutex_array.synchronize do
|
|
289
|
-
@overview[:no_queries] += 1
|
|
290
|
-
@overview[:scores].push(@run_output.overall_score)
|
|
291
|
-
@overview[:good_scores] += good_scores
|
|
292
|
-
@overview[:bad_scores] += bad_scores
|
|
293
|
-
@overview[:nee] += nee
|
|
294
|
-
@overview[:no_mafft] += no_mafft
|
|
295
|
-
@overview[:no_internet] += no_internet
|
|
296
|
-
errors.each { |err| @overview[:map_errors][err] += 1 }
|
|
297
|
-
|
|
298
|
-
vals.each do |v|
|
|
299
|
-
next if v.run_time == 0 || v.run_time.nil?
|
|
300
|
-
next if v.validation == :unapplicable || v.validation == :error
|
|
301
|
-
p = Pair1.new(@overview[:run_time][v.short_header].x + v.run_time,
|
|
302
|
-
@overview[:run_time][v.short_header].y + 1)
|
|
303
|
-
@overview[:run_time][v.short_header] = p
|
|
304
|
-
end
|
|
305
|
-
end
|
|
263
|
+
@run_output.generate_json
|
|
306
264
|
end
|
|
307
265
|
end
|
|
308
266
|
end
|
|
@@ -17,9 +17,11 @@ module GeneValidator
|
|
|
17
17
|
attr_reader :result
|
|
18
18
|
|
|
19
19
|
def initialize(short_header, header, description, gaps = 0, extra_seq = 0,
|
|
20
|
-
|
|
20
|
+
consensus = 1, threshold = 20, expected = :yes)
|
|
21
21
|
|
|
22
|
-
@short_header
|
|
22
|
+
@short_header = short_header
|
|
23
|
+
@header = header
|
|
24
|
+
@description = description
|
|
23
25
|
@gaps = (gaps * 100).round.to_s + '%'
|
|
24
26
|
@extra_seq = (extra_seq * 100).round.to_s + '%'
|
|
25
27
|
@consensus = (consensus * 100).round.to_s + '%'
|
|
@@ -47,7 +49,9 @@ module GeneValidator
|
|
|
47
49
|
else
|
|
48
50
|
t = 'These results suggest that there may be some problems with' \
|
|
49
51
|
' the query sequence.'
|
|
50
|
-
t1
|
|
52
|
+
t1 = ''
|
|
53
|
+
t2 = ''
|
|
54
|
+
t3 = '' # Create empty string variables
|
|
51
55
|
if (1 - consensus.to_i) > @threshold
|
|
52
56
|
t1 = ' There is low conservation of residues between the' \
|
|
53
57
|
' statistical profile and the query sequence (the cut-off' \
|
|
@@ -114,7 +118,7 @@ module GeneValidator
|
|
|
114
118
|
@raw_seq_file_load = config[:raw_seq_file_load]
|
|
115
119
|
@db = opt[:db]
|
|
116
120
|
@multiple_alignment = []
|
|
117
|
-
@num_threads = opt[:
|
|
121
|
+
@num_threads = opt[:mafft_threads]
|
|
118
122
|
@type = config[:type]
|
|
119
123
|
end
|
|
120
124
|
|
|
@@ -123,32 +127,31 @@ module GeneValidator
|
|
|
123
127
|
# of the first n hits
|
|
124
128
|
# Output:
|
|
125
129
|
# +AlignmentValidationOutput+ object
|
|
126
|
-
def run
|
|
130
|
+
def run
|
|
131
|
+
n = opt[:min_blast_hits] < 10 ? 10 : opt[:min_blast_hits]
|
|
127
132
|
n = 50 if n > 50
|
|
128
133
|
|
|
129
|
-
|
|
130
|
-
|
|
134
|
+
raise NotEnoughHitsError if hits.length < n
|
|
135
|
+
raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
|
|
131
136
|
|
|
132
137
|
start = Time.new
|
|
133
138
|
# get the first n hits
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
139
|
+
n_hits = [n - 1, @hits.length].min
|
|
140
|
+
less_hits = @hits[0..n_hits]
|
|
137
141
|
# get raw sequences for less_hits
|
|
138
|
-
less_hits.
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
142
|
+
less_hits.delete_if do |hit|
|
|
143
|
+
if hit.raw_sequence.nil?
|
|
144
|
+
hit.raw_sequence = FetchRawSequences.run(hit.identifier,
|
|
145
|
+
hit.accession_no)
|
|
146
|
+
end
|
|
147
|
+
hit.raw_sequence.nil? ? true : false
|
|
143
148
|
end
|
|
144
149
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
fail NoInternetError if less_hits.length == 0
|
|
150
|
+
raise NoInternetError if less_hits.length.zero?
|
|
148
151
|
# in case of nucleotide prediction sequence translate into protein
|
|
149
152
|
# translate with the reading frame of all hits considered for alignment
|
|
150
153
|
reading_frames = less_hits.map(&:reading_frame).uniq
|
|
151
|
-
|
|
154
|
+
raise ReadingFrameError if reading_frames.length != 1
|
|
152
155
|
|
|
153
156
|
if @type == :nucleotide
|
|
154
157
|
s = Bio::Sequence::NA.new(prediction.raw_sequence)
|
|
@@ -157,7 +160,7 @@ module GeneValidator
|
|
|
157
160
|
|
|
158
161
|
# multiple align sequences from less_hits with the prediction
|
|
159
162
|
# the prediction is the last sequence in the vector
|
|
160
|
-
multiple_align_mafft(prediction, less_hits)
|
|
163
|
+
@multiple_alignment = multiple_align_mafft(prediction, less_hits)
|
|
161
164
|
|
|
162
165
|
out = get_sm_pssm(@multiple_alignment[0..@multiple_alignment.length - 2])
|
|
163
166
|
sm = out[0]
|
|
@@ -182,7 +185,6 @@ module GeneValidator
|
|
|
182
185
|
@validation_report.plot_files.push(plot1)
|
|
183
186
|
@validation_report.run_time = Time.now - start
|
|
184
187
|
@validation_report
|
|
185
|
-
|
|
186
188
|
rescue NotEnoughHitsError
|
|
187
189
|
@validation_report = ValidationReport.new('Not enough evidence',
|
|
188
190
|
:warning, @short_header,
|
|
@@ -202,7 +204,7 @@ module GeneValidator
|
|
|
202
204
|
:error, @short_header,
|
|
203
205
|
@header, @description)
|
|
204
206
|
@validation_report.errors.push 'Multiple reading frames Error'
|
|
205
|
-
rescue
|
|
207
|
+
rescue StandardError
|
|
206
208
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
|
207
209
|
@short_header, @header,
|
|
208
210
|
@description)
|
|
@@ -221,25 +223,25 @@ module GeneValidator
|
|
|
221
223
|
# Output:
|
|
222
224
|
# Array of +String+s, corresponding to the multiple aligned sequences
|
|
223
225
|
# the prediction is the last sequence in the vector
|
|
224
|
-
def multiple_align_mafft(prediction
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
mafft = Bio::MAFFT.new('mafft',
|
|
230
|
-
sequences = hits.map
|
|
226
|
+
def multiple_align_mafft(prediction, hits)
|
|
227
|
+
raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
|
|
228
|
+
|
|
229
|
+
opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
|
|
230
|
+
'--thread', @num_threads.to_s]
|
|
231
|
+
mafft = Bio::MAFFT.new('mafft', opt)
|
|
232
|
+
sequences = hits.map do |h|
|
|
233
|
+
# remove the seq id - as MAFFT sometimes has an issue with this
|
|
234
|
+
f = Bio::FastaFormat.new(h.raw_sequence)
|
|
235
|
+
# check if fasta sequence otherwise returne original entry
|
|
236
|
+
f.seq.empty? ? h.raw_sequence : f.seq
|
|
237
|
+
end
|
|
231
238
|
sequences.push(prediction.protein_translation)
|
|
232
239
|
|
|
233
240
|
report = mafft.query_align(sequences)
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
@multiple_alignment.push(s.to_s)
|
|
239
|
-
end
|
|
240
|
-
|
|
241
|
-
@multiple_alignment
|
|
242
|
-
rescue
|
|
241
|
+
alignment = report.alignment.map(&:to_s)
|
|
242
|
+
raise NoMafftInstallationError if alignment.empty?
|
|
243
|
+
alignment
|
|
244
|
+
rescue StandardError
|
|
243
245
|
raise NoMafftInstallationError
|
|
244
246
|
end
|
|
245
247
|
|
|
@@ -252,7 +254,7 @@ module GeneValidator
|
|
|
252
254
|
# +ma+: array of +String+s, corresponding to the multiple aligned sequences
|
|
253
255
|
# Output:
|
|
254
256
|
# +String+ with the consensus regions
|
|
255
|
-
def get_consensus(ma
|
|
257
|
+
def get_consensus(ma)
|
|
256
258
|
align = Bio::Alignment.new(ma)
|
|
257
259
|
align.consensus
|
|
258
260
|
end
|
|
@@ -308,7 +310,7 @@ module GeneValidator
|
|
|
308
310
|
# no of conserved residues among the hits
|
|
309
311
|
no_conserved_residues = consensus.length - consensus.scan(/[\?-]/).length
|
|
310
312
|
|
|
311
|
-
return 1 if no_conserved_residues
|
|
313
|
+
return 1 if no_conserved_residues.zero?
|
|
312
314
|
|
|
313
315
|
# no of conserved residues from the hita that appear in the prediction
|
|
314
316
|
no_conserved_pred = consensus.split(//).each_index.count { |j| consensus[j] != '-' && consensus[j] != '?' && consensus[j] == prediction_raw[j] }
|
|
@@ -327,7 +329,7 @@ module GeneValidator
|
|
|
327
329
|
# +String+ representing the statistical model
|
|
328
330
|
# +Array+ with the maximum frequeny of the majoritary residue for each
|
|
329
331
|
# position
|
|
330
|
-
def get_sm_pssm(ma
|
|
332
|
+
def get_sm_pssm(ma, threshold = 0.7)
|
|
331
333
|
sm = ''
|
|
332
334
|
freq = []
|
|
333
335
|
(0..ma[0].length - 1).each do |i|
|
|
@@ -335,19 +337,10 @@ module GeneValidator
|
|
|
335
337
|
ma.map { |seq| seq[i] }.each { |res| freqs[res] += 1 }
|
|
336
338
|
# get the residue with the highest frequency
|
|
337
339
|
max_freq = freqs.map { |_res, n| n }.max
|
|
338
|
-
residue =
|
|
339
|
-
|
|
340
|
-
if residue == '-'
|
|
341
|
-
freq.push(0)
|
|
342
|
-
else
|
|
343
|
-
freq.push(max_freq / (ma.length + 0.0))
|
|
344
|
-
end
|
|
340
|
+
residue = freqs.map { |res, n| n == max_freq ? res : [] }.flatten[0]
|
|
345
341
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
else
|
|
349
|
-
sm << '?'
|
|
350
|
-
end
|
|
342
|
+
freq << residue == '-' ? 0 : (max_freq / ma.length.to_f)
|
|
343
|
+
sm += (max_freq / ma.length.to_f) >= threshold ? residue : '?'
|
|
351
344
|
end
|
|
352
345
|
[sm, freq]
|
|
353
346
|
end
|
|
@@ -364,16 +357,12 @@ module GeneValidator
|
|
|
364
357
|
gap_starts = seq.to_enum(:scan, /(-\w{1,#{len}}-)/i).map { |_m| $`.size + 1 }
|
|
365
358
|
# remove isolated residues
|
|
366
359
|
gap_starts.each do |i|
|
|
367
|
-
(i..i + len - 1).each
|
|
368
|
-
seq[j] = '-' if isalpha(seq[j])
|
|
369
|
-
end
|
|
360
|
+
(i..i + len - 1).each { |j| seq[j] = '-' if isalpha(seq[j]) }
|
|
370
361
|
end
|
|
371
362
|
# remove isolated gaps
|
|
372
363
|
res_starts = seq.to_enum(:scan, /([?\w]-{1,2}[?\w])/i).map { |_m| $`.size + 1 }
|
|
373
364
|
res_starts.each do |i|
|
|
374
|
-
(i..i + len - 1).each
|
|
375
|
-
seq[j] = '?' if seq[j] == '-'
|
|
376
|
-
end
|
|
365
|
+
(i..i + len - 1).each { |j| seq[j] = '?' if seq[j] == '-' }
|
|
377
366
|
end
|
|
378
367
|
seq
|
|
379
368
|
end
|
|
@@ -390,10 +379,11 @@ module GeneValidator
|
|
|
390
379
|
def array_to_ranges(ar)
|
|
391
380
|
prev = ar[0]
|
|
392
381
|
|
|
393
|
-
ranges = ar.slice_before
|
|
394
|
-
|
|
382
|
+
ranges = ar.slice_before do |e|
|
|
383
|
+
prev2 = prev
|
|
384
|
+
prev = e
|
|
395
385
|
prev2 + 1 != e
|
|
396
|
-
|
|
386
|
+
end.map { |a| a[0]..a[-1] }
|
|
397
387
|
|
|
398
388
|
ranges
|
|
399
389
|
end
|
|
@@ -406,17 +396,16 @@ module GeneValidator
|
|
|
406
396
|
# +ma+: +String+ array with the multiple alignmened hits and prediction
|
|
407
397
|
def plot_alignment(freq, ma = @multiple_alignment)
|
|
408
398
|
# get indeces of consensus in the multiple alignment
|
|
409
|
-
consensus = get_consensus(
|
|
399
|
+
consensus = get_consensus(ma[0..ma.length - 2])
|
|
410
400
|
consensus_idxs = consensus.split(//).each_index.select { |j| isalpha(consensus[j]) }
|
|
411
401
|
consensus_ranges = array_to_ranges(consensus_idxs)
|
|
412
402
|
|
|
413
|
-
consensus_all = get_consensus(
|
|
403
|
+
consensus_all = get_consensus(ma)
|
|
414
404
|
consensus_all_idxs = consensus_all.split(//).each_index.select { |j| isalpha(consensus_all[j]) }
|
|
415
405
|
consensus_all_ranges = array_to_ranges(consensus_all_idxs)
|
|
416
406
|
|
|
417
407
|
match_alignment = ma[0..ma.length - 2].each_with_index.map { |seq, _j| seq.split(//).each_index.select { |j| isalpha(seq[j]) } }
|
|
418
|
-
match_alignment_ranges =
|
|
419
|
-
match_alignment.each { |arr| match_alignment_ranges << array_to_ranges(arr) }
|
|
408
|
+
match_alignment_ranges = match_alignment.map { |e| array_to_ranges(e) }
|
|
420
409
|
|
|
421
410
|
query_alignment = ma[ma.length - 1].split(//).each_index.select { |j| isalpha(ma[ma.length - 1][j]) }
|
|
422
411
|
query_alignment_ranges = array_to_ranges(query_alignment)
|
|
@@ -425,15 +414,15 @@ module GeneValidator
|
|
|
425
414
|
|
|
426
415
|
# plot statistical model
|
|
427
416
|
data = freq.each_with_index.map { |h, j| { 'y' => ma.length, 'start' => j, 'stop' => j + 1, 'color' => 'orange', 'height' => h } } +
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
417
|
+
# hits
|
|
418
|
+
match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten +
|
|
419
|
+
ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten +
|
|
420
|
+
# plot prediction
|
|
421
|
+
[{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] +
|
|
422
|
+
query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten +
|
|
423
|
+
|
|
424
|
+
# plot consensus
|
|
425
|
+
consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
|
|
437
426
|
|
|
438
427
|
y_axis_values = 'Prediction'
|
|
439
428
|
(1..ma.length - 1).each { |i| y_axis_values << ", hit #{i}" }
|