genevalidator 1.6.12 → 2.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
@@ -16,7 +16,6 @@ module GeneValidator
|
|
16
16
|
|
17
17
|
def validate_args
|
18
18
|
@opt = opt
|
19
|
-
assert_output_dir_does_not_exist
|
20
19
|
assert_file_present('input file', opt[:input_fasta_file])
|
21
20
|
assert_input_file_probably_fasta
|
22
21
|
assert_input_sequence
|
@@ -38,7 +37,7 @@ module GeneValidator
|
|
38
37
|
private
|
39
38
|
|
40
39
|
def assert_validations_arg
|
41
|
-
validations = %w
|
40
|
+
validations = %w[lenc lenr frame merge dup orf align]
|
42
41
|
if @opt[:validations]
|
43
42
|
val = @opt[:validations].collect { |v| v.strip.downcase }
|
44
43
|
validations = val unless val.include? 'all'
|
@@ -48,13 +47,13 @@ module GeneValidator
|
|
48
47
|
|
49
48
|
def check_num_threads
|
50
49
|
@opt[:num_threads] = Integer(@opt[:num_threads])
|
51
|
-
unless @opt[:num_threads]
|
52
|
-
|
53
|
-
|
50
|
+
unless @opt[:num_threads].positive?
|
51
|
+
warn 'Number of threads can not be lower than 0'
|
52
|
+
warn 'Setting number of threads to 1'
|
54
53
|
@opt[:num_threads] = 1
|
55
54
|
end
|
56
55
|
return unless @opt[:num_threads] > 256
|
57
|
-
|
56
|
+
warn "Number of threads set at #{@opt[:num_threads]} is" \
|
58
57
|
' unusually high.'
|
59
58
|
end
|
60
59
|
|
@@ -68,45 +67,35 @@ module GeneValidator
|
|
68
67
|
end
|
69
68
|
end
|
70
69
|
|
71
|
-
def assert_output_dir_does_not_exist
|
72
|
-
output_dir = "#{@opt[:input_fasta_file]}.html"
|
73
|
-
return unless File.exist?(output_dir)
|
74
|
-
$stderr.puts 'The output directory already exists for this fasta file.'
|
75
|
-
$stderr.puts "\nPlease remove the following directory: #{output_dir}\n"
|
76
|
-
$stderr.puts "You can run the following command to remove the folder.\n"
|
77
|
-
$stderr.puts "\n $ rm -r #{output_dir} \n"
|
78
|
-
exit 1
|
79
|
-
end
|
80
|
-
|
81
70
|
def assert_tabular_options_exists
|
82
71
|
return if @opt[:blast_tabular_options]
|
83
|
-
|
84
|
-
|
85
|
-
|
72
|
+
warn '*** Error: BLAST tabular options (-o) have not been set.'
|
73
|
+
warn ' Please set the "-o" option with the custom format'
|
74
|
+
warn ' used in the BLAST -outfmt argument'
|
86
75
|
exit 1
|
87
76
|
end
|
88
77
|
|
89
78
|
def assert_input_file_probably_fasta
|
90
79
|
File.open(@opt[:input_fasta_file], 'r') do |file_stream|
|
91
|
-
|
80
|
+
file_stream.readline[0] == '>'
|
92
81
|
end
|
93
82
|
end
|
94
83
|
|
95
84
|
def assert_file_present(desc, file, exit_code = 1)
|
96
85
|
return if file && File.exist?(File.expand_path(file))
|
97
|
-
|
86
|
+
warn "*** Error: Couldn't find the #{desc}: #{file}."
|
98
87
|
exit exit_code
|
99
88
|
end
|
100
89
|
|
101
|
-
|
90
|
+
alias assert_dir_present assert_file_present
|
102
91
|
|
103
92
|
def assert_input_sequence
|
104
93
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
105
94
|
type = BlastUtils.type_of_sequences(fasta_content)
|
106
|
-
return if
|
107
|
-
|
108
|
-
|
109
|
-
|
95
|
+
return if %i[nucleotide protein].include? type
|
96
|
+
warn '*** Error: The input files does not contain just protein'
|
97
|
+
warn ' or nucleotide data.'
|
98
|
+
warn ' Please correct this and try again.'
|
110
99
|
exit 1
|
111
100
|
end
|
112
101
|
|
@@ -116,8 +105,8 @@ module GeneValidator
|
|
116
105
|
if File.exist?(bin) && File.directory?(bin)
|
117
106
|
add_to_path(bin)
|
118
107
|
else
|
119
|
-
|
120
|
-
|
108
|
+
warn '*** The following bin directory does not exist:'
|
109
|
+
warn " #{bin}"
|
121
110
|
end
|
122
111
|
end
|
123
112
|
end
|
@@ -131,10 +120,10 @@ module GeneValidator
|
|
131
120
|
|
132
121
|
def assert_mafft_installation
|
133
122
|
return if command?('mafft')
|
134
|
-
|
135
|
-
|
123
|
+
warn '*** Could not find Mafft binaries.'
|
124
|
+
warn ' Ignoring error and continuing - Please note that' \
|
136
125
|
' some validations may be skipped.'
|
137
|
-
|
126
|
+
warn # a blank line
|
138
127
|
end
|
139
128
|
end
|
140
129
|
|
@@ -142,7 +131,7 @@ module GeneValidator
|
|
142
131
|
class Blast
|
143
132
|
class << self
|
144
133
|
# Use a fixed minimum version of BLAST+
|
145
|
-
MINIMUM_BLAST_VERSION = '2.2.30+'
|
134
|
+
MINIMUM_BLAST_VERSION = '2.2.30+'.freeze
|
146
135
|
# Use the following exit codes, or 1.
|
147
136
|
EXIT_BLAST_NOT_INSTALLED = 2
|
148
137
|
EXIT_BLAST_NOT_COMPATIBLE = 3
|
@@ -150,7 +139,6 @@ module GeneValidator
|
|
150
139
|
|
151
140
|
def validate(opt)
|
152
141
|
assert_blast_installation
|
153
|
-
warn_if_remote_database(opt)
|
154
142
|
assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
|
155
143
|
end
|
156
144
|
|
@@ -160,27 +148,10 @@ module GeneValidator
|
|
160
148
|
assert_blast_compatible
|
161
149
|
end
|
162
150
|
|
163
|
-
def warn_if_remote_database(opt)
|
164
|
-
return if opt[:db] !~ /remote/
|
165
|
-
$stderr.puts # a blank line
|
166
|
-
if !opt[:raw_sequences] &&
|
167
|
-
(opt[:validations].include?('align') ||
|
168
|
-
opt[:validations].include?('dup'))
|
169
|
-
$stderr.puts 'Warning: Hit sequences will be fetched from remote' \
|
170
|
-
' server.'
|
171
|
-
else
|
172
|
-
$stderr.puts 'Warning: BLAST will be carried out on remote server.'
|
173
|
-
end
|
174
|
-
$stderr.puts 'This may take quite a bit of time.'
|
175
|
-
$stderr.puts 'You may want to install a local BLAST database for' \
|
176
|
-
' faster analyses.'
|
177
|
-
$stderr.puts # a blank line
|
178
|
-
end
|
179
|
-
|
180
151
|
def assert_local_blast_database_exists(db)
|
181
152
|
return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
|
182
|
-
|
183
|
-
|
153
|
+
warn '*** No BLAST database found at the provided path.'
|
154
|
+
warn ' Please ensure that the provided path is correct' \
|
184
155
|
' and then try again.'
|
185
156
|
exit EXIT_NO_BLAST_DATABASE
|
186
157
|
end
|
@@ -189,15 +160,15 @@ module GeneValidator
|
|
189
160
|
|
190
161
|
def assert_blast_installed
|
191
162
|
return if GVArgValidation.command?('blastdbcmd')
|
192
|
-
|
163
|
+
warn '*** Could not find BLAST+ binaries.'
|
193
164
|
exit EXIT_BLAST_NOT_INSTALLED
|
194
165
|
end
|
195
166
|
|
196
167
|
def assert_blast_compatible
|
197
168
|
version = `blastdbcmd -version`.split[1]
|
198
169
|
return if version >= MINIMUM_BLAST_VERSION
|
199
|
-
|
200
|
-
|
170
|
+
warn "*** Your BLAST+ version #{version} is outdated."
|
171
|
+
warn ' GeneValidator needs NCBI BLAST+ version' \
|
201
172
|
" #{MINIMUM_BLAST_VERSION} or higher."
|
202
173
|
exit EXIT_BLAST_NOT_COMPATIBLE
|
203
174
|
end
|
data/lib/genevalidator/blast.rb
CHANGED
@@ -12,34 +12,10 @@ module GeneValidator
|
|
12
12
|
class BlastUtils
|
13
13
|
class << self
|
14
14
|
extend Forwardable
|
15
|
-
def_delegators GeneValidator, :opt, :config
|
15
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
16
16
|
|
17
17
|
EVALUE = 1e-5
|
18
18
|
|
19
|
-
##
|
20
|
-
# Calls blast from standard input with specific parameters
|
21
|
-
# Params:
|
22
|
-
# +blast_type+: blast command in String format (e.g 'blast(x/p)')
|
23
|
-
# +query+: String containing the the query in fasta format
|
24
|
-
# +db+: database
|
25
|
-
# +num_threads+: The number of threads to run BLAST with.
|
26
|
-
# Output:
|
27
|
-
# String with the blast xml output
|
28
|
-
def run_blast(query, db = opt[:db], seq_type = config[:type],
|
29
|
-
num_threads = opt[:num_threads])
|
30
|
-
|
31
|
-
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
32
|
-
# -num_threads is not supported on remote databases
|
33
|
-
threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
34
|
-
|
35
|
-
blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \
|
36
|
-
" #{threads}"
|
37
|
-
|
38
|
-
cmd = "echo \"#{query}\" | #{blastcmd}"
|
39
|
-
`#{cmd} >/dev/null 2>&1`
|
40
|
-
end
|
41
|
-
|
42
|
-
##
|
43
19
|
# Runs BLAST on an input file
|
44
20
|
# Params:
|
45
21
|
# +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
|
@@ -49,32 +25,22 @@ module GeneValidator
|
|
49
25
|
# +nr_hits+: max number of hits
|
50
26
|
# Output:
|
51
27
|
# XML file
|
52
|
-
def run_blast_on_input_file
|
53
|
-
|
54
|
-
|
55
|
-
return if opt[:blast_xml_file] || opt[:blast_tabular_file]
|
28
|
+
def run_blast_on_input_file
|
29
|
+
remote = opt[:db].match?(/remote/) ? true : false
|
30
|
+
print_blast_info_text(remote)
|
56
31
|
|
57
|
-
|
58
|
-
opt
|
32
|
+
log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
|
33
|
+
`#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`
|
59
34
|
|
60
|
-
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
61
|
-
# -num_threads is not supported on remote databases
|
62
|
-
threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
63
|
-
|
64
|
-
blastcmd = "#{blast_type} -query '#{input_file}'" \
|
65
|
-
" -out '#{opt[:blast_xml_file]}' -db #{db} " \
|
66
|
-
" -evalue #{EVALUE} -outfmt 5 #{threads}"
|
67
|
-
|
68
|
-
`#{blastcmd} >/dev/null 2>&1`
|
69
35
|
return unless File.zero?(opt[:blast_xml_file])
|
70
|
-
|
71
|
-
if
|
72
|
-
|
73
|
-
|
36
|
+
warn 'Blast failed to run on the input file.'
|
37
|
+
if remote
|
38
|
+
warn 'You are using BLAST with a remote database. Please'
|
39
|
+
warn 'ensure that you have internet access and try again.'
|
74
40
|
else
|
75
|
-
|
76
|
-
$stderr.puts 'ensure that you have internet access and try again.'
|
41
|
+
warn 'Please ensure that the BLAST database exists and try again.'
|
77
42
|
end
|
43
|
+
exit 1
|
78
44
|
end
|
79
45
|
|
80
46
|
##
|
@@ -84,66 +50,24 @@ module GeneValidator
|
|
84
50
|
# +type+: the type of the sequence: :nucleotide or :protein
|
85
51
|
# Outputs:
|
86
52
|
# Array of +Sequence+ objects corresponding to the list of hits
|
87
|
-
def parse_next(iterator
|
88
|
-
hits = []
|
53
|
+
def parse_next(iterator)
|
89
54
|
iter = iterator.next
|
90
55
|
|
91
56
|
# parse blast the xml output and get the hits
|
92
57
|
# hits obtained are proteins! (we use only blastp and blastx)
|
58
|
+
hits = []
|
93
59
|
iter.each do |hit|
|
94
|
-
seq
|
95
|
-
|
60
|
+
seq = Query.new
|
96
61
|
seq.length_protein = hit.len.to_i
|
97
62
|
seq.type = :protein
|
98
63
|
seq.identifier = hit.hit_id
|
99
64
|
seq.definition = hit.hit_def
|
100
|
-
seq.accession_no
|
101
|
-
|
102
|
-
# get all high-scoring segment pairs (hsp)
|
103
|
-
hsps = []
|
104
|
-
|
105
|
-
hit.hsps.each do |hsp|
|
106
|
-
current_hsp = Hsp.new
|
107
|
-
current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
|
65
|
+
seq.accession_no = hit.accession
|
66
|
+
seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }
|
108
67
|
|
109
|
-
|
110
|
-
current_hsp.hit_to = hsp.hit_to.to_i
|
111
|
-
current_hsp.match_query_from = hsp.query_from.to_i
|
112
|
-
current_hsp.match_query_to = hsp.query_to.to_i
|
113
|
-
|
114
|
-
if type == :nucleotide
|
115
|
-
current_hsp.match_query_from /= 3
|
116
|
-
current_hsp.match_query_to /= 3
|
117
|
-
current_hsp.match_query_from += 1
|
118
|
-
current_hsp.match_query_to += 1
|
119
|
-
end
|
120
|
-
|
121
|
-
current_hsp.query_reading_frame = hsp.query_frame.to_i
|
122
|
-
|
123
|
-
current_hsp.hit_alignment = hsp.hseq.to_s
|
124
|
-
seq_type = guess_sequence_type(current_hsp.hit_alignment)
|
125
|
-
fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
|
126
|
-
|
127
|
-
current_hsp.query_alignment = hsp.qseq.to_s
|
128
|
-
seq_type = guess_sequence_type(current_hsp.query_alignment)
|
129
|
-
fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
|
130
|
-
|
131
|
-
current_hsp.align_len = hsp.align_len.to_i
|
132
|
-
current_hsp.identity = hsp.identity.to_i
|
133
|
-
current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f)
|
134
|
-
.round(2)
|
135
|
-
|
136
|
-
hsps.push(current_hsp)
|
137
|
-
end
|
138
|
-
|
139
|
-
seq.hsp_list = hsps
|
140
|
-
hits.push(seq)
|
68
|
+
hits << seq
|
141
69
|
end
|
142
|
-
|
143
70
|
hits
|
144
|
-
rescue SequenceTypeError => e
|
145
|
-
$stderr.puts e
|
146
|
-
exit 1
|
147
71
|
rescue StopIteration
|
148
72
|
nil
|
149
73
|
end
|
@@ -164,7 +88,7 @@ module GeneValidator
|
|
164
88
|
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
165
89
|
# get all sequence types
|
166
90
|
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
|
167
|
-
|
91
|
+
.uniq.compact
|
168
92
|
|
169
93
|
return nil if sequence_types.empty?
|
170
94
|
sequence_types.first if sequence_types.length == 1
|
@@ -184,7 +108,7 @@ module GeneValidator
|
|
184
108
|
return nil if cleaned_sequence.length < 10 # conservative
|
185
109
|
|
186
110
|
type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
|
187
|
-
|
111
|
+
type == Bio::Sequence::NA ? :nucleotide : :protein
|
188
112
|
end
|
189
113
|
|
190
114
|
##
|
@@ -192,11 +116,32 @@ module GeneValidator
|
|
192
116
|
def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
|
193
117
|
lines = File.foreach(file).first(10)
|
194
118
|
seqs = ''
|
195
|
-
lines.each
|
196
|
-
seqs += l.chomp unless l[0] == '>'
|
197
|
-
end
|
119
|
+
lines.each { |l| seqs += l.chomp unless l[0] == '>' }
|
198
120
|
guess_sequence_type(seqs)
|
199
121
|
end
|
122
|
+
|
123
|
+
private
|
124
|
+
|
125
|
+
def blast_cmd(opt, config, remote)
|
126
|
+
blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
|
127
|
+
# -num_threads is not supported on remote databases
|
128
|
+
threads = remote ? '' : "-num_threads #{opt[:num_threads]}"
|
129
|
+
|
130
|
+
"#{blast_type} -query '#{opt[:input_fasta_file]}'" \
|
131
|
+
" -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
|
132
|
+
" -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
|
133
|
+
end
|
134
|
+
|
135
|
+
def print_blast_info_text(remote)
|
136
|
+
warn '' # a blank line
|
137
|
+
if remote
|
138
|
+
warn '==> BLAST search and subsequent analysis will be done on a remote'
|
139
|
+
warn ' database. Please use a local database for larger analysis.'
|
140
|
+
else
|
141
|
+
warn '==> Running BLAST. This may take a while.'
|
142
|
+
end
|
143
|
+
warn '' # a blank line
|
144
|
+
end
|
200
145
|
end
|
201
146
|
end
|
202
147
|
end
|
@@ -13,7 +13,7 @@ module GeneValidator
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def print
|
16
|
-
|
16
|
+
warn "Cluster: #{x} #{y}"
|
17
17
|
end
|
18
18
|
|
19
19
|
##
|
@@ -44,7 +44,7 @@ module GeneValidator
|
|
44
44
|
# Overload quality operator
|
45
45
|
# Returns true if the pairs are equal, false otherwise
|
46
46
|
def ==(other)
|
47
|
-
|
47
|
+
other.x == x && other.y == y ? true : false
|
48
48
|
end
|
49
49
|
|
50
50
|
def eql?(other)
|
@@ -66,7 +66,7 @@ module GeneValidator
|
|
66
66
|
|
67
67
|
def print
|
68
68
|
objects.each do |elem|
|
69
|
-
|
69
|
+
warn "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
@@ -254,7 +254,7 @@ module GeneValidator
|
|
254
254
|
# Real number
|
255
255
|
def deviation(clusters, queryLength)
|
256
256
|
hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten
|
257
|
-
raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.
|
257
|
+
raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.delete('[').delete(']')
|
258
258
|
R.eval("sd = sd(c(#{raw_hits}))")
|
259
259
|
sd = R.pull('sd')
|
260
260
|
sd = standard_deviation(hits)
|
@@ -273,11 +273,11 @@ module GeneValidator
|
|
273
273
|
##
|
274
274
|
# Prints the current cluster
|
275
275
|
def print
|
276
|
-
|
276
|
+
warn "Cluster: mean = #{mean}, density = #{density}"
|
277
277
|
lengths.sort { |a, b| a <=> b }.each do |elem|
|
278
|
-
|
278
|
+
warn "#{elem[0]}, #{elem[1]}"
|
279
279
|
end
|
280
|
-
|
280
|
+
warn '--------------------------'
|
281
281
|
end
|
282
282
|
|
283
283
|
##
|
@@ -337,7 +337,7 @@ module GeneValidator
|
|
337
337
|
# clusters = array of clusters
|
338
338
|
# initially each length belongs to a different cluster
|
339
339
|
histogram.each do |e|
|
340
|
-
|
340
|
+
warn "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
|
341
341
|
hash = { e[0] => e[1] }
|
342
342
|
cluster = PairCluster.new(hash)
|
343
343
|
clusters.push(cluster)
|
@@ -355,7 +355,7 @@ module GeneValidator
|
|
355
355
|
break if no_clusters != 0 && clusters.length == no_clusters
|
356
356
|
|
357
357
|
iteration += iteration
|
358
|
-
|
358
|
+
warn "\nIteration #{iteration}" if debug
|
359
359
|
|
360
360
|
min_distance = 100_000_000
|
361
361
|
cluster1 = 0
|
@@ -365,9 +365,7 @@ module GeneValidator
|
|
365
365
|
[*(0..(clusters.length - 2))].each do |i|
|
366
366
|
[*((i + 1)..(clusters.length - 1))].each do |j|
|
367
367
|
dist = clusters[i].distance(clusters[j], distance_method)
|
368
|
-
if debug
|
369
|
-
$stderr.puts "distance between clusters #{i} and #{j} is #{dist}"
|
370
|
-
end
|
368
|
+
warn "distance between clusters #{i} and #{j} is #{dist}" if debug
|
371
369
|
current_density = clusters[i].density + clusters[j].density
|
372
370
|
if dist < min_distance
|
373
371
|
min_distance = dist
|
@@ -383,14 +381,14 @@ module GeneValidator
|
|
383
381
|
end
|
384
382
|
|
385
383
|
# merge clusters 'cluster1' and 'cluster2'
|
386
|
-
|
384
|
+
warn "clusters to merge #{cluster1} and #{cluster2}" if debug
|
387
385
|
|
388
386
|
clusters[cluster1].add(clusters[cluster2])
|
389
387
|
clusters.delete_at(cluster2)
|
390
388
|
|
391
389
|
if debug
|
392
390
|
clusters.each_with_index do |elem, i|
|
393
|
-
|
391
|
+
warn "cluster #{i}"
|
394
392
|
elem.print
|
395
393
|
end
|
396
394
|
end
|
@@ -437,8 +435,8 @@ module GeneValidator
|
|
437
435
|
|
438
436
|
# clusters = array of clusters
|
439
437
|
# initially each length belongs to a different cluster
|
440
|
-
histogram.
|
441
|
-
|
438
|
+
histogram.sort_by { |a| a[0] }.each do |elem|
|
439
|
+
warn "len #{elem[0]} appears #{elem[1]} times" if debug
|
442
440
|
hash = { elem[0] => elem[1] }
|
443
441
|
cluster = Cluster.new(hash)
|
444
442
|
clusters.push(cluster)
|
@@ -456,7 +454,7 @@ module GeneValidator
|
|
456
454
|
break if no_clusters != 0 && clusters.length == no_clusters
|
457
455
|
|
458
456
|
iteration += iteration
|
459
|
-
|
457
|
+
warn "\nIteration #{iteration}" if debug
|
460
458
|
|
461
459
|
min_distance = 100_000_000
|
462
460
|
cluster = 0
|
@@ -464,9 +462,7 @@ module GeneValidator
|
|
464
462
|
|
465
463
|
clusters[0..clusters.length - 2].each_with_index do |_item, i|
|
466
464
|
dist = clusters[i].distance(clusters[i + 1], distance_method)
|
467
|
-
if debug
|
468
|
-
$stderr.puts "distance btwn clusters #{i} and #{i + 1} is #{dist}"
|
469
|
-
end
|
465
|
+
warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
|
470
466
|
current_density = clusters[i].density + clusters[i + 1].density
|
471
467
|
if dist < min_distance
|
472
468
|
min_distance = dist
|
@@ -485,14 +481,14 @@ module GeneValidator
|
|
485
481
|
end
|
486
482
|
|
487
483
|
# merge clusters 'cluster' and 'cluster'+1
|
488
|
-
|
484
|
+
warn "clusters to merge #{cluster} and #{cluster + 1}" if debug
|
489
485
|
|
490
486
|
clusters[cluster].add(clusters[cluster + 1])
|
491
487
|
clusters.delete_at(cluster + 1)
|
492
488
|
|
493
489
|
if debug
|
494
490
|
clusters.each_with_index do |elem, i|
|
495
|
-
|
491
|
+
warn "cluster #{i}"
|
496
492
|
elem.print
|
497
493
|
end
|
498
494
|
end
|