genevalidator 1.6.1 → 1.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,24 +1,29 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'genevalidator/exceptions'
|
1
4
|
require 'genevalidator/validation_report'
|
5
|
+
require 'genevalidator/validation_test'
|
6
|
+
|
2
7
|
module GeneValidator
|
3
8
|
##
|
4
9
|
# Class that stores the validation output information
|
5
10
|
class BlastRFValidationOutput < ValidationReport
|
6
|
-
attr_reader :
|
11
|
+
attr_reader :frames
|
7
12
|
attr_reader :msg
|
8
13
|
attr_reader :total_hsp
|
9
14
|
attr_reader :result
|
10
15
|
|
11
|
-
def initialize(short_header, header, description,
|
16
|
+
def initialize(short_header, header, description, frames,
|
12
17
|
expected = :yes)
|
13
18
|
@short_header, @header, @description = short_header, header, description
|
14
|
-
@
|
19
|
+
@frames = frames
|
15
20
|
@expected = expected
|
16
21
|
@result = validation
|
17
22
|
|
18
23
|
@msg = ''
|
19
24
|
@exp_msg = ''
|
20
25
|
@total_hsp = 0
|
21
|
-
@
|
26
|
+
@frames.each do |x, y|
|
22
27
|
@msg << "#{y} HSPs align in frame #{x}; "
|
23
28
|
@exp_msg << "#{y} HSPs align in frame #{x}; "
|
24
29
|
@total_hsp += y.to_i
|
@@ -36,7 +41,7 @@ module GeneValidator
|
|
36
41
|
t = "BLAST identified #{@total_hsp} High-scoring Segment Pairs" \
|
37
42
|
' (HSPs)'
|
38
43
|
if @result == :yes # i.e. if there is only one ORF...
|
39
|
-
frame = @
|
44
|
+
frame = @frames.keys[0].to_s
|
40
45
|
t1 = "; all of these align in frame #{frame}."
|
41
46
|
else
|
42
47
|
t1 = ": #{@exp_msg.gsub(/; $/, '')}."
|
@@ -61,7 +66,7 @@ module GeneValidator
|
|
61
66
|
# check if there are different reading frames
|
62
67
|
count_p = 0
|
63
68
|
count_n = 0
|
64
|
-
|
69
|
+
frames.each do |x, _y|
|
65
70
|
count_p += 1 if x > 0
|
66
71
|
count_n += 1 if x < 0
|
67
72
|
end
|
@@ -75,7 +80,7 @@ module GeneValidator
|
|
75
80
|
class BlastReadingFrameValidation < ValidationTest
|
76
81
|
def initialize(type, prediction, hits = nil)
|
77
82
|
super
|
78
|
-
@short_header = '
|
83
|
+
@short_header = 'ReadingFrame'
|
79
84
|
@header = 'Reading Frame'
|
80
85
|
@description = 'Check whether there is a single reading frame among' \
|
81
86
|
' BLAST hits. Otherwise there might be a reading frame' \
|
@@ -101,29 +106,25 @@ module GeneValidator
|
|
101
106
|
start = Time.now
|
102
107
|
|
103
108
|
rfs = lst.map { |x| x.hsp_list.map(&:query_reading_frame) }.flatten
|
104
|
-
|
109
|
+
frames = Hash[rfs.group_by { |x| x }.map { |k, vs| [k, vs.length] }]
|
105
110
|
|
106
111
|
# get the main reading frame
|
107
|
-
main_rf =
|
108
|
-
@prediction.nucleotide_rf =
|
112
|
+
main_rf = frames.map { |_k, v| v }.max
|
113
|
+
@prediction.nucleotide_rf = frames.select { |_k, v| v == main_rf }.first.first
|
109
114
|
|
110
115
|
@validation_report = BlastRFValidationOutput.new(@short_header, @header,
|
111
|
-
@description,
|
112
|
-
|
113
|
-
@validation_report.running_time = Time.now - start
|
116
|
+
@description, frames)
|
117
|
+
@validation_report.run_time = Time.now - start
|
114
118
|
@validation_report
|
115
119
|
|
116
120
|
rescue NotEnoughHitsError
|
117
121
|
@validation_report = ValidationReport.new('Not enough evidence',
|
118
122
|
:warning, @short_header,
|
119
|
-
@header, @description
|
120
|
-
@approach, @explanation,
|
121
|
-
@conclusion)
|
123
|
+
@header, @description)
|
122
124
|
rescue Exception
|
123
125
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
124
126
|
@short_header, @header,
|
125
|
-
@description
|
126
|
-
@explanation, @conclusion)
|
127
|
+
@description)
|
127
128
|
@validation_report.errors.push 'Unexpected Error'
|
128
129
|
end
|
129
130
|
end
|
@@ -1,6 +1,12 @@
|
|
1
|
-
require '
|
1
|
+
require 'bio'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'statsample'
|
4
|
+
|
2
5
|
require 'genevalidator/exceptions'
|
3
6
|
require 'genevalidator/ext/array'
|
7
|
+
require 'genevalidator/validation_report'
|
8
|
+
require 'genevalidator/validation_test'
|
9
|
+
|
4
10
|
module GeneValidator
|
5
11
|
##
|
6
12
|
# Class that stores the validation output information
|
@@ -71,12 +77,14 @@ module GeneValidator
|
|
71
77
|
# This class contains the methods necessary for
|
72
78
|
# finding duplicated subsequences in the predicted gene
|
73
79
|
class DuplicationValidation < ValidationTest
|
80
|
+
extend Forwardable
|
81
|
+
def_delegators GeneValidator, :opt, :config
|
82
|
+
|
74
83
|
attr_reader :raw_seq_file
|
75
84
|
attr_reader :index_file_name
|
76
85
|
attr_reader :raw_seq_file_load
|
77
86
|
|
78
|
-
def initialize(
|
79
|
-
raw_seq_file_load, db, num_threads)
|
87
|
+
def initialize(prediction, hits)
|
80
88
|
super
|
81
89
|
@short_header = 'Duplication'
|
82
90
|
@header = 'Duplication'
|
@@ -84,11 +92,12 @@ module GeneValidator
|
|
84
92
|
' in the predicted gene by counting the hsp' \
|
85
93
|
' residue coverage of the prediction, for each hit.'
|
86
94
|
@cli_name = 'dup'
|
87
|
-
@raw_seq_file =
|
88
|
-
@index_file_name =
|
89
|
-
@raw_seq_file_load = raw_seq_file_load
|
90
|
-
@db = db
|
91
|
-
@num_threads = num_threads
|
95
|
+
@raw_seq_file = opt[:raw_sequences]
|
96
|
+
@index_file_name = config[:raw_seq_file_index]
|
97
|
+
@raw_seq_file_load = config[:raw_seq_file_load]
|
98
|
+
@db = opt[:db]
|
99
|
+
@num_threads = opt[:num_threads]
|
100
|
+
@type = config[:type]
|
92
101
|
end
|
93
102
|
|
94
103
|
def in_range?(ranges, idx)
|
@@ -214,7 +223,7 @@ module GeneValidator
|
|
214
223
|
@header,
|
215
224
|
@description, 1,
|
216
225
|
averages)
|
217
|
-
@validation_report.
|
226
|
+
@validation_report.run_time = Time.now - start
|
218
227
|
return @validation_report
|
219
228
|
end
|
220
229
|
|
@@ -224,31 +233,27 @@ module GeneValidator
|
|
224
233
|
@header,
|
225
234
|
@description, pval,
|
226
235
|
averages)
|
227
|
-
@
|
236
|
+
@run_time = Time.now - start
|
228
237
|
@validation_report
|
229
238
|
|
230
239
|
rescue NotEnoughHitsError
|
231
240
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
232
241
|
@short_header, @header,
|
233
|
-
@description
|
234
|
-
@conclusion)
|
242
|
+
@description)
|
235
243
|
rescue NoMafftInstallationError
|
236
244
|
@validation_report = ValidationReport.new('Mafft error', :error,
|
237
245
|
@short_header, @header,
|
238
|
-
@description
|
239
|
-
@conclusion)
|
246
|
+
@description)
|
240
247
|
@validation_report.errors.push NoMafftInstallationError
|
241
248
|
rescue NoInternetError
|
242
249
|
@validation_report = ValidationReport.new('Internet error', :error,
|
243
250
|
@short_header, @header,
|
244
|
-
@description
|
245
|
-
@conclusion)
|
251
|
+
@description)
|
246
252
|
@validation_report.errors.push NoInternetError
|
247
253
|
rescue Exception
|
248
254
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
249
255
|
@short_header, @header,
|
250
|
-
@description
|
251
|
-
@conclusion)
|
256
|
+
@description)
|
252
257
|
@validation_report.errors.push 'Unexpected Error'
|
253
258
|
end
|
254
259
|
|
@@ -256,7 +261,6 @@ module GeneValidator
|
|
256
261
|
# wilcox test implementation from statsample ruby gem
|
257
262
|
# many thanks to Claudio for helping us with the implementation!
|
258
263
|
def wilcox_test(averages)
|
259
|
-
require 'statsample'
|
260
264
|
wilcox = Statsample::Test.wilcoxon_signed_rank(averages.to_scale,
|
261
265
|
Array.new(averages.length,
|
262
266
|
1).to_scale)
|
@@ -1,6 +1,11 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
1
|
+
require 'forwardable'
|
2
|
+
require 'statsample'
|
3
|
+
|
4
|
+
require 'genevalidator/exceptions'
|
3
5
|
require 'genevalidator/ext/array'
|
6
|
+
require 'genevalidator/validation_report'
|
7
|
+
require 'genevalidator/validation_test'
|
8
|
+
|
4
9
|
module GeneValidator
|
5
10
|
##
|
6
11
|
# Class that stores the validation output information
|
@@ -83,26 +88,23 @@ module GeneValidator
|
|
83
88
|
# checking whether there is evidence that the
|
84
89
|
# prediction is a merge of multiple genes
|
85
90
|
class GeneMergeValidation < ValidationTest
|
86
|
-
attr_reader :hits
|
87
91
|
attr_reader :prediction
|
88
|
-
attr_reader :
|
92
|
+
attr_reader :hits
|
89
93
|
|
90
94
|
##
|
91
95
|
# Initializes the object
|
92
96
|
# Params:
|
93
|
-
# +type+: type of the predicted sequence (:nucleotide or :protein)
|
94
97
|
# +prediction+: a +Sequence+ object representing the blast query
|
95
98
|
# +hits+: a vector of +Sequence+ objects (representing blast hits)
|
96
|
-
# +
|
99
|
+
# +plot_path+: name of the input file, used when generating the plot files
|
97
100
|
# +boundary+: the offset of the hit from which we start analysing the hit
|
98
|
-
def initialize(
|
101
|
+
def initialize(prediction, hits, boundary = 10)
|
99
102
|
super
|
100
|
-
@short_header = '
|
103
|
+
@short_header = 'GeneMerge'
|
101
104
|
@header = 'Gene Merge'
|
102
105
|
@description = 'Check whether BLAST hits make evidence about a merge' \
|
103
106
|
' of two genes that match the predicted gene.'
|
104
107
|
@cli_name = 'merge'
|
105
|
-
@filename = filename
|
106
108
|
@boundary = boundary
|
107
109
|
end
|
108
110
|
|
@@ -158,14 +160,18 @@ module GeneValidator
|
|
158
160
|
@validation_report.plot_files.push(plot1)
|
159
161
|
plot2 = plot_matched_regions
|
160
162
|
@validation_report.plot_files.push(plot2)
|
161
|
-
@validation_report.
|
163
|
+
@validation_report.run_time = Time.now - start
|
162
164
|
@validation_report
|
163
165
|
|
164
166
|
rescue NotEnoughHitsError
|
165
167
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
166
168
|
@short_header, @header,
|
167
|
-
@description
|
168
|
-
|
169
|
+
@description)
|
170
|
+
rescue Exception
|
171
|
+
@validation_report = ValidationReport.new('Unexpected error', :error,
|
172
|
+
@short_header, @header,
|
173
|
+
@description)
|
174
|
+
@validation_report.errors.push 'Unexpected Error'
|
169
175
|
end
|
170
176
|
|
171
177
|
##
|
@@ -175,29 +181,25 @@ module GeneValidator
|
|
175
181
|
# +output+: location where the plot will be saved in jped file format
|
176
182
|
# +hits+: array of Sequence objects
|
177
183
|
# +prediction+: Sequence objects
|
178
|
-
def plot_matched_regions(
|
179
|
-
|
180
|
-
colors = ['orange', 'blue'] ##{colors[i%2]
|
181
|
-
f = File.open(output, 'w')
|
184
|
+
def plot_matched_regions(hits = @hits)
|
182
185
|
no_lines = hits.length
|
183
186
|
|
184
187
|
hits_less = hits[0..[no_lines, hits.length - 1].min]
|
185
188
|
|
186
|
-
|
189
|
+
data = hits_less.each_with_index.map { |hit, i|
|
187
190
|
{ 'y' => i,
|
188
191
|
'start' => hit.hsp_list.map(&:match_query_from).min,
|
189
192
|
'stop' => hit.hsp_list.map(&:match_query_to).max,
|
190
|
-
'color'=>'black',
|
191
|
-
'dotted'=>'true'}}.flatten +
|
193
|
+
'color' =>'black',
|
194
|
+
'dotted' =>'true'}}.flatten +
|
192
195
|
hits_less.each_with_index.map { |hit, i|
|
193
196
|
hit.hsp_list.map { |hsp|
|
194
197
|
{ 'y' => i,
|
195
198
|
'start' => hsp.match_query_from,
|
196
199
|
'stop' => hsp.match_query_to,
|
197
|
-
'color' => 'orange'} } }.flatten
|
198
|
-
f.close
|
200
|
+
'color' => 'orange'} } }.flatten
|
199
201
|
|
200
|
-
Plot.new(
|
202
|
+
Plot.new(data,
|
201
203
|
:lines,
|
202
204
|
'Gene Merge Validation: Query coord covered by blast hit (1 line/hit)',
|
203
205
|
'',
|
@@ -214,60 +216,24 @@ module GeneValidator
|
|
214
216
|
# +y_intercept+: the equation of the line is y = slope*x + y_intercept
|
215
217
|
# +output+: location where the plot will be saved in jped file format
|
216
218
|
# +hits+: array of Sequence objects
|
217
|
-
def plot_2d_start_from(slope = nil, y_intercept = nil,
|
218
|
-
output = "#{filename}_match_2d.json", hits = @hits)
|
219
|
+
def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
|
219
220
|
pairs = hits.map do |hit|
|
220
221
|
Pair.new(hit.hsp_list.map(&:match_query_from).min,
|
221
222
|
hit.hsp_list.map(&:match_query_to).max)
|
222
223
|
end
|
223
224
|
|
224
|
-
|
225
|
-
yy = pairs.map(&:y)
|
226
|
-
|
227
|
-
freq_x = xx.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
|
228
|
-
filename_x = "#{filename}_merge_x.json"
|
229
|
-
f = File.open(filename_x, 'w')
|
230
|
-
f.write([freq_x.collect { |k,v|
|
231
|
-
{ 'key' => k, 'value' => v, 'main' => (1==2) }
|
232
|
-
}].to_json)
|
233
|
-
f.close
|
234
|
-
plot3 = Plot.new(filename_x.scan(%r{([^/]+)$})[0][0],
|
235
|
-
:simplebars,
|
236
|
-
'[Gene Merge] X projection',
|
237
|
-
'',
|
238
|
-
'x projection',
|
239
|
-
'number of sequences')
|
240
|
-
# @validation_report.plot_files.push(plot3)
|
241
|
-
|
242
|
-
freq_y = yy.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
|
243
|
-
filename_y = "#{filename}_merge_y.json"
|
244
|
-
f = File.open(filename_y, 'w')
|
245
|
-
f.write([freq_y.collect { |k, v|
|
246
|
-
{ 'key' => k, 'value' => v, 'main' => (1 == 2) }
|
247
|
-
}].to_json)
|
248
|
-
f.close
|
249
|
-
plot4 = Plot.new(filename_y.scan(%r{([^/]+)$})[0][0],
|
250
|
-
:simplebars,
|
251
|
-
'[Gene Merge] Y projection',
|
252
|
-
'',
|
253
|
-
'y projection',
|
254
|
-
'number of sequences')
|
255
|
-
# @validation_report.plot_files.push(plot4)
|
256
|
-
|
257
|
-
f = File.open(output, 'w')
|
258
|
-
f.write(hits.map { |hit| {'x' => hit.hsp_list.map(&:match_query_from).min,
|
225
|
+
data = hits.map { |hit| { 'x' => hit.hsp_list.map(&:match_query_from).min,
|
259
226
|
'y' => hit.hsp_list.map(&:match_query_to).max,
|
260
|
-
'color' => 'red'}}
|
261
|
-
f.close
|
227
|
+
'color' => 'red'}}
|
262
228
|
|
263
|
-
Plot.new(
|
229
|
+
Plot.new(data,
|
264
230
|
:scatter,
|
265
231
|
'Gene Merge Validation: Start/end of matching hit coord. on query (1 point/hit)',
|
266
232
|
'',
|
267
233
|
'Start Offset (most left hsp)',
|
268
234
|
'End Offset (most right hsp)',
|
269
|
-
y_intercept,
|
270
|
-
slope)
|
235
|
+
y_intercept.to_s,
|
236
|
+
slope.to_s)
|
271
237
|
end
|
272
238
|
|
273
239
|
##
|
@@ -314,7 +280,6 @@ module GeneValidator
|
|
314
280
|
# Output:
|
315
281
|
# The equation of the regression line: [y slope]
|
316
282
|
def slope_statsample(xx, yy)
|
317
|
-
require 'statsample'
|
318
283
|
sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
|
319
284
|
[sr.a, sr.b]
|
320
285
|
end
|
@@ -1,8 +1,10 @@
|
|
1
|
-
require '
|
1
|
+
require 'forwardable'
|
2
|
+
|
2
3
|
require 'genevalidator/clusterization'
|
4
|
+
require 'genevalidator/exceptions'
|
3
5
|
require 'genevalidator/validation_report'
|
4
6
|
require 'genevalidator/validation_test'
|
5
|
-
|
7
|
+
|
6
8
|
module GeneValidator
|
7
9
|
##
|
8
10
|
# Class that stores the validation output information
|
@@ -65,7 +67,6 @@ module GeneValidator
|
|
65
67
|
# This class contains the methods necessary for
|
66
68
|
# length validation by hit length clusterization
|
67
69
|
class LengthClusterValidation < ValidationTest
|
68
|
-
attr_reader :filename
|
69
70
|
attr_reader :clusters
|
70
71
|
attr_reader :max_density_cluster
|
71
72
|
|
@@ -76,9 +77,8 @@ module GeneValidator
|
|
76
77
|
# +prediction+: a +Sequence+ object representing the blast query
|
77
78
|
# +hits+: a vector of +Sequence+ objects (representing blast hits)
|
78
79
|
# +filename+: +String+ with the name of the fasta file
|
79
|
-
def initialize(
|
80
|
+
def initialize(prediction, hits)
|
80
81
|
super
|
81
|
-
@filename = filename
|
82
82
|
@short_header = 'LengthCluster'
|
83
83
|
@header = 'Length Cluster'
|
84
84
|
@description = 'Check whether the prediction length fits most of the' \
|
@@ -117,20 +117,18 @@ module GeneValidator
|
|
117
117
|
plot1 = plot_histo_clusters
|
118
118
|
@validation_report.plot_files.push(plot1)
|
119
119
|
|
120
|
-
@validation_report.
|
120
|
+
@validation_report.run_time = Time.now - start
|
121
121
|
|
122
122
|
@validation_report
|
123
123
|
|
124
124
|
rescue NotEnoughHitsError
|
125
125
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
126
126
|
@short_header, @header,
|
127
|
-
@description
|
128
|
-
@explanation, @conclusion)
|
127
|
+
@description)
|
129
128
|
rescue Exception
|
130
129
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
131
130
|
@short_header, @header,
|
132
|
-
@description
|
133
|
-
@explanation, @conclusion)
|
131
|
+
@description)
|
134
132
|
@validation_report.errors.push 'Unexpected Error'
|
135
133
|
end
|
136
134
|
|
@@ -175,25 +173,24 @@ module GeneValidator
|
|
175
173
|
##
|
176
174
|
# Generates a json file containing data used for plotting the histogram
|
177
175
|
# of the length distribution given a list of Cluster objects
|
178
|
-
# +output+:
|
176
|
+
# +output+: plot_path where to save the graph
|
179
177
|
# +clusters+: array of +Cluster+ objects
|
180
178
|
# +max_density_cluster+: index of the most dense cluster
|
181
179
|
# +prediction+: +Sequence+ object
|
182
180
|
# Output:
|
183
181
|
# +Plot+ object
|
184
|
-
def plot_histo_clusters(output = "#{@
|
182
|
+
def plot_histo_clusters(output = "#{@plot_path}_len_clusters.json",
|
185
183
|
clusters = @clusters,
|
186
184
|
max_density_cluster = @max_density_cluster,
|
187
185
|
prediction = @prediction)
|
188
186
|
|
189
|
-
|
190
|
-
f.write(clusters.each_with_index.map { |cluster, i|
|
187
|
+
data = clusters.each_with_index.map { |cluster, i|
|
191
188
|
cluster.lengths.collect { |k, v|
|
192
189
|
{ 'key' => k, 'value' => v, 'main' => (i == max_density_cluster) }
|
193
190
|
}
|
194
|
-
}
|
195
|
-
|
196
|
-
Plot.new(
|
191
|
+
}
|
192
|
+
|
193
|
+
Plot.new(data,
|
197
194
|
:bars,
|
198
195
|
'Length Cluster Validation: Distribution of BLAST hit lengths',
|
199
196
|
'Query Sequence, black;Most Dense Cluster,red;Other Hits, blue',
|
@@ -201,41 +198,5 @@ module GeneValidator
|
|
201
198
|
'Number of Sequences',
|
202
199
|
prediction.length_protein)
|
203
200
|
end
|
204
|
-
|
205
|
-
##
|
206
|
-
# Generates a json file cotaining data used for plotting
|
207
|
-
# lines corresponding to the start and end hit offsets
|
208
|
-
# Params:
|
209
|
-
# +output+: filename where to save the graph
|
210
|
-
# +hits+: array of Sequence objects
|
211
|
-
# Output:
|
212
|
-
# +Plot+ object
|
213
|
-
def plot_len_clusters(output = "#{@filename}_len.json", _hits = @hits)
|
214
|
-
f = File.open(output, 'w')
|
215
|
-
lst = @hits.sort { |a, b| a.length_protein <=> b.length_protein }
|
216
|
-
|
217
|
-
no_lines = 100
|
218
|
-
|
219
|
-
lst_less = lst[0..[no_lines, lst.length - 1].min]
|
220
|
-
|
221
|
-
f.write((lst_less.each_with_index.map { |hit, i|
|
222
|
-
{ 'y' => i, 'start' => 0, 'stop' => hit.length_protein,
|
223
|
-
'color' => 'gray' }
|
224
|
-
} + lst_less.each_with_index.map { |hit, i|
|
225
|
-
hit.hsp_list.map { |hsp|
|
226
|
-
{ 'y' => i, 'start' => hsp.hit_from, 'stop' => hsp.hit_to,
|
227
|
-
'color' => 'red' }
|
228
|
-
}
|
229
|
-
}.flatten).to_json)
|
230
|
-
|
231
|
-
f.close
|
232
|
-
Plot.new(output.scan(%r{([^/]+)$})[0][0],
|
233
|
-
:lines,
|
234
|
-
'[Length Cluster] Matched regions in hits',
|
235
|
-
'hit, gray;high-scoring segment pairs (hsp), red',
|
236
|
-
'offset in the hit',
|
237
|
-
'number of the hit',
|
238
|
-
lst_less.length)
|
239
|
-
end
|
240
201
|
end
|
241
202
|
end
|