genevalidator 1.6.12 → 2.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +30 -1
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +13 -12
  5. data/Gemfile +4 -1
  6. data/Gemfile.lock +135 -0
  7. data/README.md +104 -122
  8. data/Rakefile +377 -5
  9. data/aux/gv_results.slim +155 -0
  10. data/aux/html_files/css/gv.compiled.min.css +8 -0
  11. data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
  12. data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
  13. data/aux/{files → html_files}/css/src/style.css +0 -0
  14. data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
  15. data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
  16. data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
  17. data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
  18. data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
  19. data/aux/{files → html_files}/img/gene.png +0 -0
  20. data/aux/html_files/js/gv.compiled.min.js +1 -0
  21. data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
  22. data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
  23. data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
  24. data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
  25. data/aux/{files → html_files}/js/src/plots.js +1 -1
  26. data/aux/{files → html_files}/js/src/script.js +0 -0
  27. data/aux/{files → html_files}/json/.gitkeep +0 -0
  28. data/bin/genevalidator +393 -56
  29. data/exemplar_data/README.md +60 -0
  30. data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
  31. data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
  32. data/genevalidator.gemspec +35 -20
  33. data/install.sh +92 -0
  34. data/lib/genevalidator.rb +171 -56
  35. data/lib/genevalidator/arg_validation.rb +26 -55
  36. data/lib/genevalidator/blast.rb +44 -99
  37. data/lib/genevalidator/clusterization.rb +18 -22
  38. data/lib/genevalidator/exceptions.rb +17 -17
  39. data/lib/genevalidator/ext/array.rb +21 -4
  40. data/lib/genevalidator/get_raw_sequences.rb +32 -31
  41. data/lib/genevalidator/hsp.rb +31 -2
  42. data/lib/genevalidator/json_to_gv_results.rb +38 -122
  43. data/lib/genevalidator/output.rb +158 -172
  44. data/lib/genevalidator/output_files.rb +134 -0
  45. data/lib/genevalidator/pool.rb +2 -5
  46. data/lib/genevalidator/query.rb +1 -1
  47. data/lib/genevalidator/tabular_parser.rb +8 -29
  48. data/lib/genevalidator/validation.rb +48 -90
  49. data/lib/genevalidator/validation_alignment.rb +64 -75
  50. data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
  51. data/lib/genevalidator/validation_duplication.rb +85 -84
  52. data/lib/genevalidator/validation_gene_merge.rb +46 -35
  53. data/lib/genevalidator/validation_length_cluster.rb +18 -15
  54. data/lib/genevalidator/validation_length_rank.rb +19 -15
  55. data/lib/genevalidator/validation_maker_qi.rb +13 -12
  56. data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
  57. data/lib/genevalidator/validation_report.rb +1 -1
  58. data/lib/genevalidator/validation_test.rb +1 -1
  59. data/lib/genevalidator/version.rb +1 -1
  60. data/test/overall.rb +1 -1
  61. data/test/test_all_validations.rb +36 -24
  62. data/test/test_blast.rb +39 -24
  63. data/test/test_clusterization_2d.rb +4 -4
  64. data/test/test_helper.rb +2 -2
  65. data/test/test_query.rb +16 -20
  66. data/test/test_validation_open_reading_frame.rb +122 -122
  67. data/test/test_validations.rb +12 -10
  68. metadata +94 -79
  69. data/aux/files/css/genevalidator.compiled.min.css +0 -16
  70. data/aux/files/js/genevalidator.compiled.min.js +0 -28
  71. data/aux/json_footer.erb +0 -8
  72. data/aux/json_header.erb +0 -19
  73. data/aux/json_query.erb +0 -15
  74. data/aux/template_footer.erb +0 -8
  75. data/aux/template_header.erb +0 -19
  76. data/aux/template_query.erb +0 -14
  77. data/data/README.md +0 -57
  78. data/data/mrna_data.fasta.blast_tabular +0 -3567
  79. data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
  80. data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
  81. data/data/mrna_data.fasta.blast_xml +0 -39800
  82. data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
  83. data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
  84. data/data/mrna_data.fasta.json +0 -1
  85. data/data/protein_data.fasta.blast_tabular +0 -3278
  86. data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
  87. data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
  88. data/data/protein_data.fasta.blast_xml +0 -26228
  89. data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
  90. data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
  91. data/data/protein_data.fasta.json +0 -1
@@ -16,7 +16,6 @@ module GeneValidator
16
16
 
17
17
  def validate_args
18
18
  @opt = opt
19
- assert_output_dir_does_not_exist
20
19
  assert_file_present('input file', opt[:input_fasta_file])
21
20
  assert_input_file_probably_fasta
22
21
  assert_input_sequence
@@ -38,7 +37,7 @@ module GeneValidator
38
37
  private
39
38
 
40
39
  def assert_validations_arg
41
- validations = %w(lenc lenr frame merge dup orf align)
40
+ validations = %w[lenc lenr frame merge dup orf align]
42
41
  if @opt[:validations]
43
42
  val = @opt[:validations].collect { |v| v.strip.downcase }
44
43
  validations = val unless val.include? 'all'
@@ -48,13 +47,13 @@ module GeneValidator
48
47
 
49
48
  def check_num_threads
50
49
  @opt[:num_threads] = Integer(@opt[:num_threads])
51
- unless @opt[:num_threads] > 0
52
- $stderr.puts 'Number of threads can not be lower than 0'
53
- $stderr.puts 'Setting number of threads to 1'
50
+ unless @opt[:num_threads].positive?
51
+ warn 'Number of threads can not be lower than 0'
52
+ warn 'Setting number of threads to 1'
54
53
  @opt[:num_threads] = 1
55
54
  end
56
55
  return unless @opt[:num_threads] > 256
57
- $stderr.puts "Number of threads set at #{@opt[:num_threads]} is" \
56
+ warn "Number of threads set at #{@opt[:num_threads]} is" \
58
57
  ' unusually high.'
59
58
  end
60
59
 
@@ -68,45 +67,35 @@ module GeneValidator
68
67
  end
69
68
  end
70
69
 
71
- def assert_output_dir_does_not_exist
72
- output_dir = "#{@opt[:input_fasta_file]}.html"
73
- return unless File.exist?(output_dir)
74
- $stderr.puts 'The output directory already exists for this fasta file.'
75
- $stderr.puts "\nPlease remove the following directory: #{output_dir}\n"
76
- $stderr.puts "You can run the following command to remove the folder.\n"
77
- $stderr.puts "\n $ rm -r #{output_dir} \n"
78
- exit 1
79
- end
80
-
81
70
  def assert_tabular_options_exists
82
71
  return if @opt[:blast_tabular_options]
83
- $stderr.puts '*** Error: BLAST tabular options (-o) have not been set.'
84
- $stderr.puts ' Please set the "-o" option with the custom format'
85
- $stderr.puts ' used in the BLAST -outfmt argument'
72
+ warn '*** Error: BLAST tabular options (-o) have not been set.'
73
+ warn ' Please set the "-o" option with the custom format'
74
+ warn ' used in the BLAST -outfmt argument'
86
75
  exit 1
87
76
  end
88
77
 
89
78
  def assert_input_file_probably_fasta
90
79
  File.open(@opt[:input_fasta_file], 'r') do |file_stream|
91
- (file_stream.readline[0] == '>') ? true : false
80
+ file_stream.readline[0] == '>'
92
81
  end
93
82
  end
94
83
 
95
84
  def assert_file_present(desc, file, exit_code = 1)
96
85
  return if file && File.exist?(File.expand_path(file))
97
- $stderr.puts "*** Error: Couldn't find the #{desc}: #{file}."
86
+ warn "*** Error: Couldn't find the #{desc}: #{file}."
98
87
  exit exit_code
99
88
  end
100
89
 
101
- alias_method :assert_dir_present, :assert_file_present
90
+ alias assert_dir_present assert_file_present
102
91
 
103
92
  def assert_input_sequence
104
93
  fasta_content = IO.binread(@opt[:input_fasta_file])
105
94
  type = BlastUtils.type_of_sequences(fasta_content)
106
- return if type == :nucleotide || type == :protein
107
- $stderr.puts '*** Error: The input files does not contain just protein'
108
- $stderr.puts ' or nucleotide data.'
109
- $stderr.puts ' Please correct this and try again.'
95
+ return if %i[nucleotide protein].include? type
96
+ warn '*** Error: The input files does not contain just protein'
97
+ warn ' or nucleotide data.'
98
+ warn ' Please correct this and try again.'
110
99
  exit 1
111
100
  end
112
101
 
@@ -116,8 +105,8 @@ module GeneValidator
116
105
  if File.exist?(bin) && File.directory?(bin)
117
106
  add_to_path(bin)
118
107
  else
119
- $stderr.puts '*** The following bin directory does not exist:'
120
- $stderr.puts " #{bin}"
108
+ warn '*** The following bin directory does not exist:'
109
+ warn " #{bin}"
121
110
  end
122
111
  end
123
112
  end
@@ -131,10 +120,10 @@ module GeneValidator
131
120
 
132
121
  def assert_mafft_installation
133
122
  return if command?('mafft')
134
- $stderr.puts '*** Could not find Mafft binaries.'
135
- $stderr.puts ' Ignoring error and continuing - Please note that' \
123
+ warn '*** Could not find Mafft binaries.'
124
+ warn ' Ignoring error and continuing - Please note that' \
136
125
  ' some validations may be skipped.'
137
- $stderr.puts # a blank line
126
+ warn # a blank line
138
127
  end
139
128
  end
140
129
 
@@ -142,7 +131,7 @@ module GeneValidator
142
131
  class Blast
143
132
  class << self
144
133
  # Use a fixed minimum version of BLAST+
145
- MINIMUM_BLAST_VERSION = '2.2.30+'
134
+ MINIMUM_BLAST_VERSION = '2.2.30+'.freeze
146
135
  # Use the following exit codes, or 1.
147
136
  EXIT_BLAST_NOT_INSTALLED = 2
148
137
  EXIT_BLAST_NOT_COMPATIBLE = 3
@@ -150,7 +139,6 @@ module GeneValidator
150
139
 
151
140
  def validate(opt)
152
141
  assert_blast_installation
153
- warn_if_remote_database(opt)
154
142
  assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
155
143
  end
156
144
 
@@ -160,27 +148,10 @@ module GeneValidator
160
148
  assert_blast_compatible
161
149
  end
162
150
 
163
- def warn_if_remote_database(opt)
164
- return if opt[:db] !~ /remote/
165
- $stderr.puts # a blank line
166
- if !opt[:raw_sequences] &&
167
- (opt[:validations].include?('align') ||
168
- opt[:validations].include?('dup'))
169
- $stderr.puts 'Warning: Hit sequences will be fetched from remote' \
170
- ' server.'
171
- else
172
- $stderr.puts 'Warning: BLAST will be carried out on remote server.'
173
- end
174
- $stderr.puts 'This may take quite a bit of time.'
175
- $stderr.puts 'You may want to install a local BLAST database for' \
176
- ' faster analyses.'
177
- $stderr.puts # a blank line
178
- end
179
-
180
151
  def assert_local_blast_database_exists(db)
181
152
  return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
182
- $stderr.puts '*** No BLAST database found at the provided path.'
183
- $stderr.puts ' Please ensure that the provided path is correct' \
153
+ warn '*** No BLAST database found at the provided path.'
154
+ warn ' Please ensure that the provided path is correct' \
184
155
  ' and then try again.'
185
156
  exit EXIT_NO_BLAST_DATABASE
186
157
  end
@@ -189,15 +160,15 @@ module GeneValidator
189
160
 
190
161
  def assert_blast_installed
191
162
  return if GVArgValidation.command?('blastdbcmd')
192
- $stderr.puts '*** Could not find BLAST+ binaries.'
163
+ warn '*** Could not find BLAST+ binaries.'
193
164
  exit EXIT_BLAST_NOT_INSTALLED
194
165
  end
195
166
 
196
167
  def assert_blast_compatible
197
168
  version = `blastdbcmd -version`.split[1]
198
169
  return if version >= MINIMUM_BLAST_VERSION
199
- $stderr.puts "*** Your BLAST+ version #{version} is outdated."
200
- $stderr.puts ' GeneValidator needs NCBI BLAST+ version' \
170
+ warn "*** Your BLAST+ version #{version} is outdated."
171
+ warn ' GeneValidator needs NCBI BLAST+ version' \
201
172
  " #{MINIMUM_BLAST_VERSION} or higher."
202
173
  exit EXIT_BLAST_NOT_COMPATIBLE
203
174
  end
@@ -12,34 +12,10 @@ module GeneValidator
12
12
  class BlastUtils
13
13
  class << self
14
14
  extend Forwardable
15
- def_delegators GeneValidator, :opt, :config
15
+ def_delegators GeneValidator, :opt, :config, :dirs
16
16
 
17
17
  EVALUE = 1e-5
18
18
 
19
- ##
20
- # Calls blast from standard input with specific parameters
21
- # Params:
22
- # +blast_type+: blast command in String format (e.g 'blast(x/p)')
23
- # +query+: String containing the the query in fasta format
24
- # +db+: database
25
- # +num_threads+: The number of threads to run BLAST with.
26
- # Output:
27
- # String with the blast xml output
28
- def run_blast(query, db = opt[:db], seq_type = config[:type],
29
- num_threads = opt[:num_threads])
30
-
31
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
32
- # -num_threads is not supported on remote databases
33
- threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
34
-
35
- blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \
36
- " #{threads}"
37
-
38
- cmd = "echo \"#{query}\" | #{blastcmd}"
39
- `#{cmd} >/dev/null 2>&1`
40
- end
41
-
42
- ##
43
19
  # Runs BLAST on an input file
44
20
  # Params:
45
21
  # +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
@@ -49,32 +25,22 @@ module GeneValidator
49
25
  # +nr_hits+: max number of hits
50
26
  # Output:
51
27
  # XML file
52
- def run_blast_on_input_file(input_file = opt[:input_fasta_file],
53
- db = opt[:db], seq_type = config[:type],
54
- num_threads = opt[:num_threads])
55
- return if opt[:blast_xml_file] || opt[:blast_tabular_file]
28
+ def run_blast_on_input_file
29
+ remote = opt[:db].match?(/remote/) ? true : false
30
+ print_blast_info_text(remote)
56
31
 
57
- $stderr.puts 'Running BLAST. This may take a while.'
58
- opt[:blast_xml_file] = input_file + '.blast_xml'
32
+ log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
33
+ `#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`
59
34
 
60
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
61
- # -num_threads is not supported on remote databases
62
- threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
63
-
64
- blastcmd = "#{blast_type} -query '#{input_file}'" \
65
- " -out '#{opt[:blast_xml_file]}' -db #{db} " \
66
- " -evalue #{EVALUE} -outfmt 5 #{threads}"
67
-
68
- `#{blastcmd} >/dev/null 2>&1`
69
35
  return unless File.zero?(opt[:blast_xml_file])
70
- $stderr.puts 'Blast failed to run on the input file.'
71
- if opt[:db] !~ /remote/
72
- $stderr.puts 'Please ensure that the BLAST database exists and try'
73
- $stderr.puts 'again.'
36
+ warn 'Blast failed to run on the input file.'
37
+ if remote
38
+ warn 'You are using BLAST with a remote database. Please'
39
+ warn 'ensure that you have internet access and try again.'
74
40
  else
75
- $stderr.puts 'You are using BLAST with a remote database. Please'
76
- $stderr.puts 'ensure that you have internet access and try again.'
41
+ warn 'Please ensure that the BLAST database exists and try again.'
77
42
  end
43
+ exit 1
78
44
  end
79
45
 
80
46
  ##
@@ -84,66 +50,24 @@ module GeneValidator
84
50
  # +type+: the type of the sequence: :nucleotide or :protein
85
51
  # Outputs:
86
52
  # Array of +Sequence+ objects corresponding to the list of hits
87
- def parse_next(iterator, type = config[:type])
88
- hits = []
53
+ def parse_next(iterator)
89
54
  iter = iterator.next
90
55
 
91
56
  # parse blast the xml output and get the hits
92
57
  # hits obtained are proteins! (we use only blastp and blastx)
58
+ hits = []
93
59
  iter.each do |hit|
94
- seq = Query.new
95
-
60
+ seq = Query.new
96
61
  seq.length_protein = hit.len.to_i
97
62
  seq.type = :protein
98
63
  seq.identifier = hit.hit_id
99
64
  seq.definition = hit.hit_def
100
- seq.accession_no = hit.accession
101
-
102
- # get all high-scoring segment pairs (hsp)
103
- hsps = []
104
-
105
- hit.hsps.each do |hsp|
106
- current_hsp = Hsp.new
107
- current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
65
+ seq.accession_no = hit.accession
66
+ seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }
108
67
 
109
- current_hsp.hit_from = hsp.hit_from.to_i
110
- current_hsp.hit_to = hsp.hit_to.to_i
111
- current_hsp.match_query_from = hsp.query_from.to_i
112
- current_hsp.match_query_to = hsp.query_to.to_i
113
-
114
- if type == :nucleotide
115
- current_hsp.match_query_from /= 3
116
- current_hsp.match_query_to /= 3
117
- current_hsp.match_query_from += 1
118
- current_hsp.match_query_to += 1
119
- end
120
-
121
- current_hsp.query_reading_frame = hsp.query_frame.to_i
122
-
123
- current_hsp.hit_alignment = hsp.hseq.to_s
124
- seq_type = guess_sequence_type(current_hsp.hit_alignment)
125
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
126
-
127
- current_hsp.query_alignment = hsp.qseq.to_s
128
- seq_type = guess_sequence_type(current_hsp.query_alignment)
129
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
130
-
131
- current_hsp.align_len = hsp.align_len.to_i
132
- current_hsp.identity = hsp.identity.to_i
133
- current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f)
134
- .round(2)
135
-
136
- hsps.push(current_hsp)
137
- end
138
-
139
- seq.hsp_list = hsps
140
- hits.push(seq)
68
+ hits << seq
141
69
  end
142
-
143
70
  hits
144
- rescue SequenceTypeError => e
145
- $stderr.puts e
146
- exit 1
147
71
  rescue StopIteration
148
72
  nil
149
73
  end
@@ -164,7 +88,7 @@ module GeneValidator
164
88
  sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
165
89
  # get all sequence types
166
90
  sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
167
- .uniq.compact
91
+ .uniq.compact
168
92
 
169
93
  return nil if sequence_types.empty?
170
94
  sequence_types.first if sequence_types.length == 1
@@ -184,7 +108,7 @@ module GeneValidator
184
108
  return nil if cleaned_sequence.length < 10 # conservative
185
109
 
186
110
  type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
187
- (type == Bio::Sequence::NA) ? :nucleotide : :protein
111
+ type == Bio::Sequence::NA ? :nucleotide : :protein
188
112
  end
189
113
 
190
114
  ##
@@ -192,11 +116,32 @@ module GeneValidator
192
116
  def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
193
117
  lines = File.foreach(file).first(10)
194
118
  seqs = ''
195
- lines.each do |l|
196
- seqs += l.chomp unless l[0] == '>'
197
- end
119
+ lines.each { |l| seqs += l.chomp unless l[0] == '>' }
198
120
  guess_sequence_type(seqs)
199
121
  end
122
+
123
+ private
124
+
125
+ def blast_cmd(opt, config, remote)
126
+ blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
127
+ # -num_threads is not supported on remote databases
128
+ threads = remote ? '' : "-num_threads #{opt[:num_threads]}"
129
+
130
+ "#{blast_type} -query '#{opt[:input_fasta_file]}'" \
131
+ " -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
132
+ " -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
133
+ end
134
+
135
+ def print_blast_info_text(remote)
136
+ warn '' # a blank line
137
+ if remote
138
+ warn '==> BLAST search and subsequent analysis will be done on a remote'
139
+ warn ' database. Please use a local database for larger analysis.'
140
+ else
141
+ warn '==> Running BLAST. This may take a while.'
142
+ end
143
+ warn '' # a blank line
144
+ end
200
145
  end
201
146
  end
202
147
  end
@@ -13,7 +13,7 @@ module GeneValidator
13
13
  end
14
14
 
15
15
  def print
16
- $stderr.puts "Cluster: #{x} #{y}"
16
+ warn "Cluster: #{x} #{y}"
17
17
  end
18
18
 
19
19
  ##
@@ -44,7 +44,7 @@ module GeneValidator
44
44
  # Overload quality operator
45
45
  # Returns true if the pairs are equal, false otherwise
46
46
  def ==(other)
47
- (other.x == x && other.y == y) ? true : false
47
+ other.x == x && other.y == y ? true : false
48
48
  end
49
49
 
50
50
  def eql?(other)
@@ -66,7 +66,7 @@ module GeneValidator
66
66
 
67
67
  def print
68
68
  objects.each do |elem|
69
- $stderr.puts "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
69
+ warn "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
70
70
  end
71
71
  end
72
72
 
@@ -254,7 +254,7 @@ module GeneValidator
254
254
  # Real number
255
255
  def deviation(clusters, queryLength)
256
256
  hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten
257
- raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.gsub('[', '').gsub(']', '')
257
+ raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.delete('[').delete(']')
258
258
  R.eval("sd = sd(c(#{raw_hits}))")
259
259
  sd = R.pull('sd')
260
260
  sd = standard_deviation(hits)
@@ -273,11 +273,11 @@ module GeneValidator
273
273
  ##
274
274
  # Prints the current cluster
275
275
  def print
276
- $stderr.puts "Cluster: mean = #{mean}, density = #{density}"
276
+ warn "Cluster: mean = #{mean}, density = #{density}"
277
277
  lengths.sort { |a, b| a <=> b }.each do |elem|
278
- $stderr.puts "#{elem[0]}, #{elem[1]}"
278
+ warn "#{elem[0]}, #{elem[1]}"
279
279
  end
280
- $stderr.puts '--------------------------'
280
+ warn '--------------------------'
281
281
  end
282
282
 
283
283
  ##
@@ -337,7 +337,7 @@ module GeneValidator
337
337
  # clusters = array of clusters
338
338
  # initially each length belongs to a different cluster
339
339
  histogram.each do |e|
340
- $stderr.puts "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
340
+ warn "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
341
341
  hash = { e[0] => e[1] }
342
342
  cluster = PairCluster.new(hash)
343
343
  clusters.push(cluster)
@@ -355,7 +355,7 @@ module GeneValidator
355
355
  break if no_clusters != 0 && clusters.length == no_clusters
356
356
 
357
357
  iteration += iteration
358
- $stderr.puts "\nIteration #{iteration}" if debug
358
+ warn "\nIteration #{iteration}" if debug
359
359
 
360
360
  min_distance = 100_000_000
361
361
  cluster1 = 0
@@ -365,9 +365,7 @@ module GeneValidator
365
365
  [*(0..(clusters.length - 2))].each do |i|
366
366
  [*((i + 1)..(clusters.length - 1))].each do |j|
367
367
  dist = clusters[i].distance(clusters[j], distance_method)
368
- if debug
369
- $stderr.puts "distance between clusters #{i} and #{j} is #{dist}"
370
- end
368
+ warn "distance between clusters #{i} and #{j} is #{dist}" if debug
371
369
  current_density = clusters[i].density + clusters[j].density
372
370
  if dist < min_distance
373
371
  min_distance = dist
@@ -383,14 +381,14 @@ module GeneValidator
383
381
  end
384
382
 
385
383
  # merge clusters 'cluster1' and 'cluster2'
386
- $stderr.puts "clusters to merge #{cluster1} and #{cluster2}" if debug
384
+ warn "clusters to merge #{cluster1} and #{cluster2}" if debug
387
385
 
388
386
  clusters[cluster1].add(clusters[cluster2])
389
387
  clusters.delete_at(cluster2)
390
388
 
391
389
  if debug
392
390
  clusters.each_with_index do |elem, i|
393
- $stderr.puts "cluster #{i}"
391
+ warn "cluster #{i}"
394
392
  elem.print
395
393
  end
396
394
  end
@@ -437,8 +435,8 @@ module GeneValidator
437
435
 
438
436
  # clusters = array of clusters
439
437
  # initially each length belongs to a different cluster
440
- histogram.sort { |a, b| a[0] <=> b[0] }.each do |elem|
441
- $stderr.puts "len #{elem[0]} appears #{elem[1]} times" if debug
438
+ histogram.sort_by { |a| a[0] }.each do |elem|
439
+ warn "len #{elem[0]} appears #{elem[1]} times" if debug
442
440
  hash = { elem[0] => elem[1] }
443
441
  cluster = Cluster.new(hash)
444
442
  clusters.push(cluster)
@@ -456,7 +454,7 @@ module GeneValidator
456
454
  break if no_clusters != 0 && clusters.length == no_clusters
457
455
 
458
456
  iteration += iteration
459
- $stderr.puts "\nIteration #{iteration}" if debug
457
+ warn "\nIteration #{iteration}" if debug
460
458
 
461
459
  min_distance = 100_000_000
462
460
  cluster = 0
@@ -464,9 +462,7 @@ module GeneValidator
464
462
 
465
463
  clusters[0..clusters.length - 2].each_with_index do |_item, i|
466
464
  dist = clusters[i].distance(clusters[i + 1], distance_method)
467
- if debug
468
- $stderr.puts "distance btwn clusters #{i} and #{i + 1} is #{dist}"
469
- end
465
+ warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
470
466
  current_density = clusters[i].density + clusters[i + 1].density
471
467
  if dist < min_distance
472
468
  min_distance = dist
@@ -485,14 +481,14 @@ module GeneValidator
485
481
  end
486
482
 
487
483
  # merge clusters 'cluster' and 'cluster'+1
488
- $stderr.puts "clusters to merge #{cluster} and #{cluster + 1}" if debug
484
+ warn "clusters to merge #{cluster} and #{cluster + 1}" if debug
489
485
 
490
486
  clusters[cluster].add(clusters[cluster + 1])
491
487
  clusters.delete_at(cluster + 1)
492
488
 
493
489
  if debug
494
490
  clusters.each_with_index do |elem, i|
495
- $stderr.puts "cluster #{i}"
491
+ warn "cluster #{i}"
496
492
  elem.print
497
493
  end
498
494
  end