genevalidator 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,24 +1,29 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'genevalidator/exceptions'
|
1
4
|
require 'genevalidator/validation_report'
|
5
|
+
require 'genevalidator/validation_test'
|
6
|
+
|
2
7
|
module GeneValidator
|
3
8
|
##
|
4
9
|
# Class that stores the validation output information
|
5
10
|
class BlastRFValidationOutput < ValidationReport
|
6
|
-
attr_reader :
|
11
|
+
attr_reader :frames
|
7
12
|
attr_reader :msg
|
8
13
|
attr_reader :total_hsp
|
9
14
|
attr_reader :result
|
10
15
|
|
11
|
-
def initialize(short_header, header, description,
|
16
|
+
def initialize(short_header, header, description, frames,
|
12
17
|
expected = :yes)
|
13
18
|
@short_header, @header, @description = short_header, header, description
|
14
|
-
@
|
19
|
+
@frames = frames
|
15
20
|
@expected = expected
|
16
21
|
@result = validation
|
17
22
|
|
18
23
|
@msg = ''
|
19
24
|
@exp_msg = ''
|
20
25
|
@total_hsp = 0
|
21
|
-
@
|
26
|
+
@frames.each do |x, y|
|
22
27
|
@msg << "#{y} HSPs align in frame #{x}; "
|
23
28
|
@exp_msg << "#{y} HSPs align in frame #{x}; "
|
24
29
|
@total_hsp += y.to_i
|
@@ -36,7 +41,7 @@ module GeneValidator
|
|
36
41
|
t = "BLAST identified #{@total_hsp} High-scoring Segment Pairs" \
|
37
42
|
' (HSPs)'
|
38
43
|
if @result == :yes # i.e. if there is only one ORF...
|
39
|
-
frame = @
|
44
|
+
frame = @frames.keys[0].to_s
|
40
45
|
t1 = "; all of these align in frame #{frame}."
|
41
46
|
else
|
42
47
|
t1 = ": #{@exp_msg.gsub(/; $/, '')}."
|
@@ -61,7 +66,7 @@ module GeneValidator
|
|
61
66
|
# chack if there are different reading frames
|
62
67
|
count_p = 0
|
63
68
|
count_n = 0
|
64
|
-
|
69
|
+
frames.each do |x, _y|
|
65
70
|
count_p += 1 if x > 0
|
66
71
|
count_n += 1 if x < 0
|
67
72
|
end
|
@@ -75,7 +80,7 @@ module GeneValidator
|
|
75
80
|
class BlastReadingFrameValidation < ValidationTest
|
76
81
|
def initialize(type, prediction, hits = nil)
|
77
82
|
super
|
78
|
-
@short_header = '
|
83
|
+
@short_header = 'ReadingFrame'
|
79
84
|
@header = 'Reading Frame'
|
80
85
|
@description = 'Check whether there is a single reading frame among' \
|
81
86
|
' BLAST hits. Otherwise there might be a reading frame' \
|
@@ -101,29 +106,25 @@ module GeneValidator
|
|
101
106
|
start = Time.now
|
102
107
|
|
103
108
|
rfs = lst.map { |x| x.hsp_list.map(&:query_reading_frame) }.flatten
|
104
|
-
|
109
|
+
frames = Hash[rfs.group_by { |x| x }.map { |k, vs| [k, vs.length] }]
|
105
110
|
|
106
111
|
# get the main reading frame
|
107
|
-
main_rf =
|
108
|
-
@prediction.nucleotide_rf =
|
112
|
+
main_rf = frames.map { |_k, v| v }.max
|
113
|
+
@prediction.nucleotide_rf = frames.select { |_k, v| v == main_rf }.first.first
|
109
114
|
|
110
115
|
@validation_report = BlastRFValidationOutput.new(@short_header, @header,
|
111
|
-
@description,
|
112
|
-
|
113
|
-
@validation_report.running_time = Time.now - start
|
116
|
+
@description, frames)
|
117
|
+
@validation_report.run_time = Time.now - start
|
114
118
|
@validation_report
|
115
119
|
|
116
120
|
rescue NotEnoughHitsError
|
117
121
|
@validation_report = ValidationReport.new('Not enough evidence',
|
118
122
|
:warning, @short_header,
|
119
|
-
@header, @description
|
120
|
-
@approach, @explanation,
|
121
|
-
@conclusion)
|
123
|
+
@header, @description)
|
122
124
|
rescue Exception
|
123
125
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
124
126
|
@short_header, @header,
|
125
|
-
@description
|
126
|
-
@explanation, @conclusion)
|
127
|
+
@description)
|
127
128
|
@validation_report.errors.push 'Unexpected Error'
|
128
129
|
end
|
129
130
|
end
|
@@ -1,6 +1,12 @@
|
|
1
|
-
require '
|
1
|
+
require 'bio'
|
2
|
+
require 'forwardable'
|
3
|
+
require 'statsample'
|
4
|
+
|
2
5
|
require 'genevalidator/exceptions'
|
3
6
|
require 'genevalidator/ext/array'
|
7
|
+
require 'genevalidator/validation_report'
|
8
|
+
require 'genevalidator/validation_test'
|
9
|
+
|
4
10
|
module GeneValidator
|
5
11
|
##
|
6
12
|
# Class that stores the validation output information
|
@@ -71,12 +77,14 @@ module GeneValidator
|
|
71
77
|
# This class contains the methods necessary for
|
72
78
|
# finding duplicated subsequences in the predicted gene
|
73
79
|
class DuplicationValidation < ValidationTest
|
80
|
+
extend Forwardable
|
81
|
+
def_delegators GeneValidator, :opt, :config
|
82
|
+
|
74
83
|
attr_reader :raw_seq_file
|
75
84
|
attr_reader :index_file_name
|
76
85
|
attr_reader :raw_seq_file_load
|
77
86
|
|
78
|
-
def initialize(
|
79
|
-
raw_seq_file_load, db, num_threads)
|
87
|
+
def initialize(prediction, hits)
|
80
88
|
super
|
81
89
|
@short_header = 'Duplication'
|
82
90
|
@header = 'Duplication'
|
@@ -84,11 +92,12 @@ module GeneValidator
|
|
84
92
|
' in the predicted gene by counting the hsp' \
|
85
93
|
' residue coverage of the prediction, for each hit.'
|
86
94
|
@cli_name = 'dup'
|
87
|
-
@raw_seq_file =
|
88
|
-
@index_file_name =
|
89
|
-
@raw_seq_file_load = raw_seq_file_load
|
90
|
-
@db = db
|
91
|
-
@num_threads = num_threads
|
95
|
+
@raw_seq_file = opt[:raw_sequences]
|
96
|
+
@index_file_name = config[:raw_seq_file_index]
|
97
|
+
@raw_seq_file_load = config[:raw_seq_file_load]
|
98
|
+
@db = opt[:db]
|
99
|
+
@num_threads = opt[:num_threads]
|
100
|
+
@type = config[:type]
|
92
101
|
end
|
93
102
|
|
94
103
|
def in_range?(ranges, idx)
|
@@ -214,7 +223,7 @@ module GeneValidator
|
|
214
223
|
@header,
|
215
224
|
@description, 1,
|
216
225
|
averages)
|
217
|
-
@validation_report.
|
226
|
+
@validation_report.run_time = Time.now - start
|
218
227
|
return @validation_report
|
219
228
|
end
|
220
229
|
|
@@ -224,31 +233,27 @@ module GeneValidator
|
|
224
233
|
@header,
|
225
234
|
@description, pval,
|
226
235
|
averages)
|
227
|
-
@
|
236
|
+
@run_time = Time.now - start
|
228
237
|
@validation_report
|
229
238
|
|
230
239
|
rescue NotEnoughHitsError
|
231
240
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
232
241
|
@short_header, @header,
|
233
|
-
@description
|
234
|
-
@conclusion)
|
242
|
+
@description)
|
235
243
|
rescue NoMafftInstallationError
|
236
244
|
@validation_report = ValidationReport.new('Mafft error', :error,
|
237
245
|
@short_header, @header,
|
238
|
-
@description
|
239
|
-
@conclusion)
|
246
|
+
@description)
|
240
247
|
@validation_report.errors.push NoMafftInstallationError
|
241
248
|
rescue NoInternetError
|
242
249
|
@validation_report = ValidationReport.new('Internet error', :error,
|
243
250
|
@short_header, @header,
|
244
|
-
@description
|
245
|
-
@conclusion)
|
251
|
+
@description)
|
246
252
|
@validation_report.errors.push NoInternetError
|
247
253
|
rescue Exception
|
248
254
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
249
255
|
@short_header, @header,
|
250
|
-
@description
|
251
|
-
@conclusion)
|
256
|
+
@description)
|
252
257
|
@validation_report.errors.push 'Unexpected Error'
|
253
258
|
end
|
254
259
|
|
@@ -256,7 +261,6 @@ module GeneValidator
|
|
256
261
|
# wilcox test implementation from statsample ruby gem
|
257
262
|
# many thanks to Claudio for helping us with the implementation!
|
258
263
|
def wilcox_test(averages)
|
259
|
-
require 'statsample'
|
260
264
|
wilcox = Statsample::Test.wilcoxon_signed_rank(averages.to_scale,
|
261
265
|
Array.new(averages.length,
|
262
266
|
1).to_scale)
|
@@ -1,6 +1,11 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
1
|
+
require 'forwardable'
|
2
|
+
require 'statsample'
|
3
|
+
|
4
|
+
require 'genevalidator/exceptions'
|
3
5
|
require 'genevalidator/ext/array'
|
6
|
+
require 'genevalidator/validation_report'
|
7
|
+
require 'genevalidator/validation_test'
|
8
|
+
|
4
9
|
module GeneValidator
|
5
10
|
##
|
6
11
|
# Class that stores the validation output information
|
@@ -83,26 +88,23 @@ module GeneValidator
|
|
83
88
|
# checking whether there is evidence that the
|
84
89
|
# prediction is a merge of multiple genes
|
85
90
|
class GeneMergeValidation < ValidationTest
|
86
|
-
attr_reader :hits
|
87
91
|
attr_reader :prediction
|
88
|
-
attr_reader :
|
92
|
+
attr_reader :hits
|
89
93
|
|
90
94
|
##
|
91
95
|
# Initilizes the object
|
92
96
|
# Params:
|
93
|
-
# +type+: type of the predicted sequence (:nucleotide or :protein)
|
94
97
|
# +prediction+: a +Sequence+ object representing the blast query
|
95
98
|
# +hits+: a vector of +Sequence+ objects (representing blast hits)
|
96
|
-
# +
|
99
|
+
# +plot_path+: name of the input file, used when generatig the plot files
|
97
100
|
# +boundary+: the offset of the hit from which we start analysing the hit
|
98
|
-
def initialize(
|
101
|
+
def initialize(prediction, hits, boundary = 10)
|
99
102
|
super
|
100
|
-
@short_header = '
|
103
|
+
@short_header = 'GeneMerge'
|
101
104
|
@header = 'Gene Merge'
|
102
105
|
@description = 'Check whether BLAST hits make evidence about a merge' \
|
103
106
|
' of two genes that match the predicted gene.'
|
104
107
|
@cli_name = 'merge'
|
105
|
-
@filename = filename
|
106
108
|
@boundary = boundary
|
107
109
|
end
|
108
110
|
|
@@ -158,14 +160,18 @@ module GeneValidator
|
|
158
160
|
@validation_report.plot_files.push(plot1)
|
159
161
|
plot2 = plot_matched_regions
|
160
162
|
@validation_report.plot_files.push(plot2)
|
161
|
-
@validation_report.
|
163
|
+
@validation_report.run_time = Time.now - start
|
162
164
|
@validation_report
|
163
165
|
|
164
166
|
rescue NotEnoughHitsError
|
165
167
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
166
168
|
@short_header, @header,
|
167
|
-
@description
|
168
|
-
|
169
|
+
@description)
|
170
|
+
rescue Exception
|
171
|
+
@validation_report = ValidationReport.new('Unexpected error', :error,
|
172
|
+
@short_header, @header,
|
173
|
+
@description)
|
174
|
+
@validation_report.errors.push 'Unexpected Error'
|
169
175
|
end
|
170
176
|
|
171
177
|
##
|
@@ -175,29 +181,25 @@ module GeneValidator
|
|
175
181
|
# +output+: location where the plot will be saved in jped file format
|
176
182
|
# +hits+: array of Sequence objects
|
177
183
|
# +prediction+: Sequence objects
|
178
|
-
def plot_matched_regions(
|
179
|
-
|
180
|
-
colors = ['orange', 'blue'] ##{colors[i%2]
|
181
|
-
f = File.open(output, 'w')
|
184
|
+
def plot_matched_regions(hits = @hits)
|
182
185
|
no_lines = hits.length
|
183
186
|
|
184
187
|
hits_less = hits[0..[no_lines, hits.length - 1].min]
|
185
188
|
|
186
|
-
|
189
|
+
data = hits_less.each_with_index.map { |hit, i|
|
187
190
|
{ 'y' => i,
|
188
191
|
'start' => hit.hsp_list.map(&:match_query_from).min,
|
189
192
|
'stop' => hit.hsp_list.map(&:match_query_to).max,
|
190
|
-
'color'=>'black',
|
191
|
-
'dotted'=>'true'}}.flatten +
|
193
|
+
'color' =>'black',
|
194
|
+
'dotted' =>'true'}}.flatten +
|
192
195
|
hits_less.each_with_index.map { |hit, i|
|
193
196
|
hit.hsp_list.map { |hsp|
|
194
197
|
{ 'y' => i,
|
195
198
|
'start' => hsp.match_query_from,
|
196
199
|
'stop' => hsp.match_query_to,
|
197
|
-
'color' => 'orange'} } }.flatten
|
198
|
-
f.close
|
200
|
+
'color' => 'orange'} } }.flatten
|
199
201
|
|
200
|
-
Plot.new(
|
202
|
+
Plot.new(data,
|
201
203
|
:lines,
|
202
204
|
'Gene Merge Validation: Query coord covered by blast hit (1 line/hit)',
|
203
205
|
'',
|
@@ -214,60 +216,24 @@ module GeneValidator
|
|
214
216
|
# +y_intercept+: the ecuation of the line is y= slope*x + y_intercept
|
215
217
|
# +output+: location where the plot will be saved in jped file format
|
216
218
|
# +hits+: array of Sequence objects
|
217
|
-
def plot_2d_start_from(slope = nil, y_intercept = nil,
|
218
|
-
output = "#{filename}_match_2d.json", hits = @hits)
|
219
|
+
def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
|
219
220
|
pairs = hits.map do |hit|
|
220
221
|
Pair.new(hit.hsp_list.map(&:match_query_from).min,
|
221
222
|
hit.hsp_list.map(&:match_query_to).max)
|
222
223
|
end
|
223
224
|
|
224
|
-
|
225
|
-
yy = pairs.map(&:y)
|
226
|
-
|
227
|
-
freq_x = xx.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
|
228
|
-
filename_x = "#{filename}_merge_x.json"
|
229
|
-
f = File.open(filename_x, 'w')
|
230
|
-
f.write([freq_x.collect { |k,v|
|
231
|
-
{ 'key' => k, 'value' => v, 'main' => (1==2) }
|
232
|
-
}].to_json)
|
233
|
-
f.close
|
234
|
-
plot3 = Plot.new(filename_x.scan(%r{([^/]+)$})[0][0],
|
235
|
-
:simplebars,
|
236
|
-
'[Gene Merge] X projection',
|
237
|
-
'',
|
238
|
-
'x projection',
|
239
|
-
'number of sequences')
|
240
|
-
# @validation_report.plot_files.push(plot3)
|
241
|
-
|
242
|
-
freq_y = yy.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
|
243
|
-
filename_y = "#{filename}_merge_y.json"
|
244
|
-
f = File.open(filename_y, 'w')
|
245
|
-
f.write([freq_y.collect { |k, v|
|
246
|
-
{ 'key' => k, 'value' => v, 'main' => (1 == 2) }
|
247
|
-
}].to_json)
|
248
|
-
f.close
|
249
|
-
plot4 = Plot.new(filename_y.scan(%r{([^/]+)$})[0][0],
|
250
|
-
:simplebars,
|
251
|
-
'[Gene Merge] Y projection',
|
252
|
-
'',
|
253
|
-
'y projection',
|
254
|
-
'number of sequences')
|
255
|
-
# @validation_report.plot_files.push(plot4)
|
256
|
-
|
257
|
-
f = File.open(output, 'w')
|
258
|
-
f.write(hits.map { |hit| {'x' => hit.hsp_list.map(&:match_query_from).min,
|
225
|
+
data = hits.map { |hit| { 'x' => hit.hsp_list.map(&:match_query_from).min,
|
259
226
|
'y' => hit.hsp_list.map(&:match_query_to).max,
|
260
|
-
'color' => 'red'}}
|
261
|
-
f.close
|
227
|
+
'color' => 'red'}}
|
262
228
|
|
263
|
-
Plot.new(
|
229
|
+
Plot.new(data,
|
264
230
|
:scatter,
|
265
231
|
'Gene Merge Validation: Start/end of matching hit coord. on query (1 point/hit)',
|
266
232
|
'',
|
267
233
|
'Start Offset (most left hsp)',
|
268
234
|
'End Offset (most right hsp)',
|
269
|
-
y_intercept,
|
270
|
-
slope)
|
235
|
+
y_intercept.to_s,
|
236
|
+
slope.to_s)
|
271
237
|
end
|
272
238
|
|
273
239
|
##
|
@@ -314,7 +280,6 @@ module GeneValidator
|
|
314
280
|
# Output:
|
315
281
|
# The ecuation of the regression line: [y slope]
|
316
282
|
def slope_statsample(xx, yy)
|
317
|
-
require 'statsample'
|
318
283
|
sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
|
319
284
|
[sr.a, sr.b]
|
320
285
|
end
|
@@ -1,8 +1,10 @@
|
|
1
|
-
require '
|
1
|
+
require 'forwardable'
|
2
|
+
|
2
3
|
require 'genevalidator/clusterization'
|
4
|
+
require 'genevalidator/exceptions'
|
3
5
|
require 'genevalidator/validation_report'
|
4
6
|
require 'genevalidator/validation_test'
|
5
|
-
|
7
|
+
|
6
8
|
module GeneValidator
|
7
9
|
##
|
8
10
|
# Class that stores the validation output information
|
@@ -65,7 +67,6 @@ module GeneValidator
|
|
65
67
|
# This class contains the methods necessary for
|
66
68
|
# length validation by hit length clusterization
|
67
69
|
class LengthClusterValidation < ValidationTest
|
68
|
-
attr_reader :filename
|
69
70
|
attr_reader :clusters
|
70
71
|
attr_reader :max_density_cluster
|
71
72
|
|
@@ -76,9 +77,8 @@ module GeneValidator
|
|
76
77
|
# +prediction+: a +Sequence+ object representing the blast query
|
77
78
|
# +hits+: a vector of +Sequence+ objects (representing blast hits)
|
78
79
|
# +dilename+: +String+ with the name of the fasta file
|
79
|
-
def initialize(
|
80
|
+
def initialize(prediction, hits)
|
80
81
|
super
|
81
|
-
@filename = filename
|
82
82
|
@short_header = 'LengthCluster'
|
83
83
|
@header = 'Length Cluster'
|
84
84
|
@description = 'Check whether the prediction length fits most of the' \
|
@@ -117,20 +117,18 @@ module GeneValidator
|
|
117
117
|
plot1 = plot_histo_clusters
|
118
118
|
@validation_report.plot_files.push(plot1)
|
119
119
|
|
120
|
-
@validation_report.
|
120
|
+
@validation_report.run_time = Time.now - start
|
121
121
|
|
122
122
|
@validation_report
|
123
123
|
|
124
124
|
rescue NotEnoughHitsError
|
125
125
|
@validation_report = ValidationReport.new('Not enough evidence', :warning,
|
126
126
|
@short_header, @header,
|
127
|
-
@description
|
128
|
-
@explanation, @conclusion)
|
127
|
+
@description)
|
129
128
|
rescue Exception
|
130
129
|
@validation_report = ValidationReport.new('Unexpected error', :error,
|
131
130
|
@short_header, @header,
|
132
|
-
@description
|
133
|
-
@explanation, @conclusion)
|
131
|
+
@description)
|
134
132
|
@validation_report.errors.push 'Unexpected Error'
|
135
133
|
end
|
136
134
|
|
@@ -175,25 +173,24 @@ module GeneValidator
|
|
175
173
|
##
|
176
174
|
# Generates a json file containing data used for plotting the histogram
|
177
175
|
# of the length distribution given a lust of Cluster objects
|
178
|
-
# +output+:
|
176
|
+
# +output+: plot_path where to save the graph
|
179
177
|
# +clusters+: array of +Cluster+ objects
|
180
178
|
# +max_density_cluster+: index of the most dense cluster
|
181
179
|
# +prediction+: +Sequence+ object
|
182
180
|
# Output:
|
183
181
|
# +Plot+ object
|
184
|
-
def plot_histo_clusters(output = "#{@
|
182
|
+
def plot_histo_clusters(output = "#{@plot_path}_len_clusters.json",
|
185
183
|
clusters = @clusters,
|
186
184
|
max_density_cluster = @max_density_cluster,
|
187
185
|
prediction = @prediction)
|
188
186
|
|
189
|
-
|
190
|
-
f.write(clusters.each_with_index.map { |cluster, i|
|
187
|
+
data = clusters.each_with_index.map { |cluster, i|
|
191
188
|
cluster.lengths.collect { |k, v|
|
192
189
|
{ 'key' => k, 'value' => v, 'main' => (i == max_density_cluster) }
|
193
190
|
}
|
194
|
-
}
|
195
|
-
|
196
|
-
Plot.new(
|
191
|
+
}
|
192
|
+
|
193
|
+
Plot.new(data,
|
197
194
|
:bars,
|
198
195
|
'Length Cluster Validation: Distribution of BLAST hit lengths',
|
199
196
|
'Query Sequence, black;Most Dense Cluster,red;Other Hits, blue',
|
@@ -201,41 +198,5 @@ module GeneValidator
|
|
201
198
|
'Number of Sequences',
|
202
199
|
prediction.length_protein)
|
203
200
|
end
|
204
|
-
|
205
|
-
##
|
206
|
-
# Generates a json file cotaining data used for plotting
|
207
|
-
# lines corresponding to the start and end hit offsets
|
208
|
-
# Params:
|
209
|
-
# +output+: filename where to save the graph
|
210
|
-
# +hits+: array of Sequence objects
|
211
|
-
# Output:
|
212
|
-
# +Plot+ object
|
213
|
-
def plot_len_clusters(output = "#{@filename}_len.json", _hits = @hits)
|
214
|
-
f = File.open(output, 'w')
|
215
|
-
lst = @hits.sort { |a, b| a.length_protein <=> b.length_protein }
|
216
|
-
|
217
|
-
no_lines = 100
|
218
|
-
|
219
|
-
lst_less = lst[0..[no_lines, lst.length - 1].min]
|
220
|
-
|
221
|
-
f.write((lst_less.each_with_index.map { |hit, i|
|
222
|
-
{ 'y' => i, 'start' => 0, 'stop' => hit.length_protein,
|
223
|
-
'color' => 'gray' }
|
224
|
-
} + lst_less.each_with_index.map { |hit, i|
|
225
|
-
hit.hsp_list.map { |hsp|
|
226
|
-
{ 'y' => i, 'start' => hsp.hit_from, 'stop' => hsp.hit_to,
|
227
|
-
'color' => 'red' }
|
228
|
-
}
|
229
|
-
}.flatten).to_json)
|
230
|
-
|
231
|
-
f.close
|
232
|
-
Plot.new(output.scan(%r{([^/]+)$})[0][0],
|
233
|
-
:lines,
|
234
|
-
'[Length Cluster] Matched regions in hits',
|
235
|
-
'hit, gray;high-scoring segment pairs (hsp), red',
|
236
|
-
'offset in the hit',
|
237
|
-
'number of the hit',
|
238
|
-
lst_less.length)
|
239
|
-
end
|
240
201
|
end
|
241
202
|
end
|