genevalidator 1.6.12 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +30 -1
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +13 -12
  5. data/Gemfile +4 -1
  6. data/Gemfile.lock +135 -0
  7. data/README.md +104 -122
  8. data/Rakefile +377 -5
  9. data/aux/gv_results.slim +155 -0
  10. data/aux/html_files/css/gv.compiled.min.css +8 -0
  11. data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
  12. data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
  13. data/aux/{files → html_files}/css/src/style.css +0 -0
  14. data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
  15. data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
  16. data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
  17. data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
  18. data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
  19. data/aux/{files → html_files}/img/gene.png +0 -0
  20. data/aux/html_files/js/gv.compiled.min.js +1 -0
  21. data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
  22. data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
  23. data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
  24. data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
  25. data/aux/{files → html_files}/js/src/plots.js +1 -1
  26. data/aux/{files → html_files}/js/src/script.js +0 -0
  27. data/aux/{files → html_files}/json/.gitkeep +0 -0
  28. data/bin/genevalidator +393 -56
  29. data/exemplar_data/README.md +60 -0
  30. data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
  31. data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
  32. data/genevalidator.gemspec +35 -20
  33. data/install.sh +92 -0
  34. data/lib/genevalidator.rb +171 -56
  35. data/lib/genevalidator/arg_validation.rb +26 -55
  36. data/lib/genevalidator/blast.rb +44 -99
  37. data/lib/genevalidator/clusterization.rb +18 -22
  38. data/lib/genevalidator/exceptions.rb +17 -17
  39. data/lib/genevalidator/ext/array.rb +21 -4
  40. data/lib/genevalidator/get_raw_sequences.rb +32 -31
  41. data/lib/genevalidator/hsp.rb +31 -2
  42. data/lib/genevalidator/json_to_gv_results.rb +38 -122
  43. data/lib/genevalidator/output.rb +158 -172
  44. data/lib/genevalidator/output_files.rb +134 -0
  45. data/lib/genevalidator/pool.rb +2 -5
  46. data/lib/genevalidator/query.rb +1 -1
  47. data/lib/genevalidator/tabular_parser.rb +8 -29
  48. data/lib/genevalidator/validation.rb +48 -90
  49. data/lib/genevalidator/validation_alignment.rb +64 -75
  50. data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
  51. data/lib/genevalidator/validation_duplication.rb +85 -84
  52. data/lib/genevalidator/validation_gene_merge.rb +46 -35
  53. data/lib/genevalidator/validation_length_cluster.rb +18 -15
  54. data/lib/genevalidator/validation_length_rank.rb +19 -15
  55. data/lib/genevalidator/validation_maker_qi.rb +13 -12
  56. data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
  57. data/lib/genevalidator/validation_report.rb +1 -1
  58. data/lib/genevalidator/validation_test.rb +1 -1
  59. data/lib/genevalidator/version.rb +1 -1
  60. data/test/overall.rb +1 -1
  61. data/test/test_all_validations.rb +36 -24
  62. data/test/test_blast.rb +39 -24
  63. data/test/test_clusterization_2d.rb +4 -4
  64. data/test/test_helper.rb +2 -2
  65. data/test/test_query.rb +16 -20
  66. data/test/test_validation_open_reading_frame.rb +122 -122
  67. data/test/test_validations.rb +12 -10
  68. metadata +94 -79
  69. data/aux/files/css/genevalidator.compiled.min.css +0 -16
  70. data/aux/files/js/genevalidator.compiled.min.js +0 -28
  71. data/aux/json_footer.erb +0 -8
  72. data/aux/json_header.erb +0 -19
  73. data/aux/json_query.erb +0 -15
  74. data/aux/template_footer.erb +0 -8
  75. data/aux/template_header.erb +0 -19
  76. data/aux/template_query.erb +0 -14
  77. data/data/README.md +0 -57
  78. data/data/mrna_data.fasta.blast_tabular +0 -3567
  79. data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
  80. data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
  81. data/data/mrna_data.fasta.blast_xml +0 -39800
  82. data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
  83. data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
  84. data/data/mrna_data.fasta.json +0 -1
  85. data/data/protein_data.fasta.blast_tabular +0 -3278
  86. data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
  87. data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
  88. data/data/protein_data.fasta.blast_xml +0 -26228
  89. data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
  90. data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
  91. data/data/protein_data.fasta.json +0 -1
@@ -16,7 +16,6 @@ module GeneValidator
16
16
 
17
17
  def validate_args
18
18
  @opt = opt
19
- assert_output_dir_does_not_exist
20
19
  assert_file_present('input file', opt[:input_fasta_file])
21
20
  assert_input_file_probably_fasta
22
21
  assert_input_sequence
@@ -38,7 +37,7 @@ module GeneValidator
38
37
  private
39
38
 
40
39
  def assert_validations_arg
41
- validations = %w(lenc lenr frame merge dup orf align)
40
+ validations = %w[lenc lenr frame merge dup orf align]
42
41
  if @opt[:validations]
43
42
  val = @opt[:validations].collect { |v| v.strip.downcase }
44
43
  validations = val unless val.include? 'all'
@@ -48,13 +47,13 @@ module GeneValidator
48
47
 
49
48
  def check_num_threads
50
49
  @opt[:num_threads] = Integer(@opt[:num_threads])
51
- unless @opt[:num_threads] > 0
52
- $stderr.puts 'Number of threads can not be lower than 0'
53
- $stderr.puts 'Setting number of threads to 1'
50
+ unless @opt[:num_threads].positive?
51
+ warn 'Number of threads can not be lower than 0'
52
+ warn 'Setting number of threads to 1'
54
53
  @opt[:num_threads] = 1
55
54
  end
56
55
  return unless @opt[:num_threads] > 256
57
- $stderr.puts "Number of threads set at #{@opt[:num_threads]} is" \
56
+ warn "Number of threads set at #{@opt[:num_threads]} is" \
58
57
  ' unusually high.'
59
58
  end
60
59
 
@@ -68,45 +67,35 @@ module GeneValidator
68
67
  end
69
68
  end
70
69
 
71
- def assert_output_dir_does_not_exist
72
- output_dir = "#{@opt[:input_fasta_file]}.html"
73
- return unless File.exist?(output_dir)
74
- $stderr.puts 'The output directory already exists for this fasta file.'
75
- $stderr.puts "\nPlease remove the following directory: #{output_dir}\n"
76
- $stderr.puts "You can run the following command to remove the folder.\n"
77
- $stderr.puts "\n $ rm -r #{output_dir} \n"
78
- exit 1
79
- end
80
-
81
70
  def assert_tabular_options_exists
82
71
  return if @opt[:blast_tabular_options]
83
- $stderr.puts '*** Error: BLAST tabular options (-o) have not been set.'
84
- $stderr.puts ' Please set the "-o" option with the custom format'
85
- $stderr.puts ' used in the BLAST -outfmt argument'
72
+ warn '*** Error: BLAST tabular options (-o) have not been set.'
73
+ warn ' Please set the "-o" option with the custom format'
74
+ warn ' used in the BLAST -outfmt argument'
86
75
  exit 1
87
76
  end
88
77
 
89
78
  def assert_input_file_probably_fasta
90
79
  File.open(@opt[:input_fasta_file], 'r') do |file_stream|
91
- (file_stream.readline[0] == '>') ? true : false
80
+ file_stream.readline[0] == '>'
92
81
  end
93
82
  end
94
83
 
95
84
  def assert_file_present(desc, file, exit_code = 1)
96
85
  return if file && File.exist?(File.expand_path(file))
97
- $stderr.puts "*** Error: Couldn't find the #{desc}: #{file}."
86
+ warn "*** Error: Couldn't find the #{desc}: #{file}."
98
87
  exit exit_code
99
88
  end
100
89
 
101
- alias_method :assert_dir_present, :assert_file_present
90
+ alias assert_dir_present assert_file_present
102
91
 
103
92
  def assert_input_sequence
104
93
  fasta_content = IO.binread(@opt[:input_fasta_file])
105
94
  type = BlastUtils.type_of_sequences(fasta_content)
106
- return if type == :nucleotide || type == :protein
107
- $stderr.puts '*** Error: The input files does not contain just protein'
108
- $stderr.puts ' or nucleotide data.'
109
- $stderr.puts ' Please correct this and try again.'
95
+ return if %i[nucleotide protein].include? type
96
+ warn '*** Error: The input files does not contain just protein'
97
+ warn ' or nucleotide data.'
98
+ warn ' Please correct this and try again.'
110
99
  exit 1
111
100
  end
112
101
 
@@ -116,8 +105,8 @@ module GeneValidator
116
105
  if File.exist?(bin) && File.directory?(bin)
117
106
  add_to_path(bin)
118
107
  else
119
- $stderr.puts '*** The following bin directory does not exist:'
120
- $stderr.puts " #{bin}"
108
+ warn '*** The following bin directory does not exist:'
109
+ warn " #{bin}"
121
110
  end
122
111
  end
123
112
  end
@@ -131,10 +120,10 @@ module GeneValidator
131
120
 
132
121
  def assert_mafft_installation
133
122
  return if command?('mafft')
134
- $stderr.puts '*** Could not find Mafft binaries.'
135
- $stderr.puts ' Ignoring error and continuing - Please note that' \
123
+ warn '*** Could not find Mafft binaries.'
124
+ warn ' Ignoring error and continuing - Please note that' \
136
125
  ' some validations may be skipped.'
137
- $stderr.puts # a blank line
126
+ warn # a blank line
138
127
  end
139
128
  end
140
129
 
@@ -142,7 +131,7 @@ module GeneValidator
142
131
  class Blast
143
132
  class << self
144
133
  # Use a fixed minimum version of BLAST+
145
- MINIMUM_BLAST_VERSION = '2.2.30+'
134
+ MINIMUM_BLAST_VERSION = '2.2.30+'.freeze
146
135
  # Use the following exit codes, or 1.
147
136
  EXIT_BLAST_NOT_INSTALLED = 2
148
137
  EXIT_BLAST_NOT_COMPATIBLE = 3
@@ -150,7 +139,6 @@ module GeneValidator
150
139
 
151
140
  def validate(opt)
152
141
  assert_blast_installation
153
- warn_if_remote_database(opt)
154
142
  assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
155
143
  end
156
144
 
@@ -160,27 +148,10 @@ module GeneValidator
160
148
  assert_blast_compatible
161
149
  end
162
150
 
163
- def warn_if_remote_database(opt)
164
- return if opt[:db] !~ /remote/
165
- $stderr.puts # a blank line
166
- if !opt[:raw_sequences] &&
167
- (opt[:validations].include?('align') ||
168
- opt[:validations].include?('dup'))
169
- $stderr.puts 'Warning: Hit sequences will be fetched from remote' \
170
- ' server.'
171
- else
172
- $stderr.puts 'Warning: BLAST will be carried out on remote server.'
173
- end
174
- $stderr.puts 'This may take quite a bit of time.'
175
- $stderr.puts 'You may want to install a local BLAST database for' \
176
- ' faster analyses.'
177
- $stderr.puts # a blank line
178
- end
179
-
180
151
  def assert_local_blast_database_exists(db)
181
152
  return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
182
- $stderr.puts '*** No BLAST database found at the provided path.'
183
- $stderr.puts ' Please ensure that the provided path is correct' \
153
+ warn '*** No BLAST database found at the provided path.'
154
+ warn ' Please ensure that the provided path is correct' \
184
155
  ' and then try again.'
185
156
  exit EXIT_NO_BLAST_DATABASE
186
157
  end
@@ -189,15 +160,15 @@ module GeneValidator
189
160
 
190
161
  def assert_blast_installed
191
162
  return if GVArgValidation.command?('blastdbcmd')
192
- $stderr.puts '*** Could not find BLAST+ binaries.'
163
+ warn '*** Could not find BLAST+ binaries.'
193
164
  exit EXIT_BLAST_NOT_INSTALLED
194
165
  end
195
166
 
196
167
  def assert_blast_compatible
197
168
  version = `blastdbcmd -version`.split[1]
198
169
  return if version >= MINIMUM_BLAST_VERSION
199
- $stderr.puts "*** Your BLAST+ version #{version} is outdated."
200
- $stderr.puts ' GeneValidator needs NCBI BLAST+ version' \
170
+ warn "*** Your BLAST+ version #{version} is outdated."
171
+ warn ' GeneValidator needs NCBI BLAST+ version' \
201
172
  " #{MINIMUM_BLAST_VERSION} or higher."
202
173
  exit EXIT_BLAST_NOT_COMPATIBLE
203
174
  end
@@ -12,34 +12,10 @@ module GeneValidator
12
12
  class BlastUtils
13
13
  class << self
14
14
  extend Forwardable
15
- def_delegators GeneValidator, :opt, :config
15
+ def_delegators GeneValidator, :opt, :config, :dirs
16
16
 
17
17
  EVALUE = 1e-5
18
18
 
19
- ##
20
- # Calls blast from standard input with specific parameters
21
- # Params:
22
- # +blast_type+: blast command in String format (e.g 'blast(x/p)')
23
- # +query+: String containing the the query in fasta format
24
- # +db+: database
25
- # +num_threads+: The number of threads to run BLAST with.
26
- # Output:
27
- # String with the blast xml output
28
- def run_blast(query, db = opt[:db], seq_type = config[:type],
29
- num_threads = opt[:num_threads])
30
-
31
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
32
- # -num_threads is not supported on remote databases
33
- threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
34
-
35
- blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \
36
- " #{threads}"
37
-
38
- cmd = "echo \"#{query}\" | #{blastcmd}"
39
- `#{cmd} >/dev/null 2>&1`
40
- end
41
-
42
- ##
43
19
  # Runs BLAST on an input file
44
20
  # Params:
45
21
  # +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
@@ -49,32 +25,22 @@ module GeneValidator
49
25
  # +nr_hits+: max number of hits
50
26
  # Output:
51
27
  # XML file
52
- def run_blast_on_input_file(input_file = opt[:input_fasta_file],
53
- db = opt[:db], seq_type = config[:type],
54
- num_threads = opt[:num_threads])
55
- return if opt[:blast_xml_file] || opt[:blast_tabular_file]
28
+ def run_blast_on_input_file
29
+ remote = opt[:db].match?(/remote/) ? true : false
30
+ print_blast_info_text(remote)
56
31
 
57
- $stderr.puts 'Running BLAST. This may take a while.'
58
- opt[:blast_xml_file] = input_file + '.blast_xml'
32
+ log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
33
+ `#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`
59
34
 
60
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
61
- # -num_threads is not supported on remote databases
62
- threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
63
-
64
- blastcmd = "#{blast_type} -query '#{input_file}'" \
65
- " -out '#{opt[:blast_xml_file]}' -db #{db} " \
66
- " -evalue #{EVALUE} -outfmt 5 #{threads}"
67
-
68
- `#{blastcmd} >/dev/null 2>&1`
69
35
  return unless File.zero?(opt[:blast_xml_file])
70
- $stderr.puts 'Blast failed to run on the input file.'
71
- if opt[:db] !~ /remote/
72
- $stderr.puts 'Please ensure that the BLAST database exists and try'
73
- $stderr.puts 'again.'
36
+ warn 'Blast failed to run on the input file.'
37
+ if remote
38
+ warn 'You are using BLAST with a remote database. Please'
39
+ warn 'ensure that you have internet access and try again.'
74
40
  else
75
- $stderr.puts 'You are using BLAST with a remote database. Please'
76
- $stderr.puts 'ensure that you have internet access and try again.'
41
+ warn 'Please ensure that the BLAST database exists and try again.'
77
42
  end
43
+ exit 1
78
44
  end
79
45
 
80
46
  ##
@@ -84,66 +50,24 @@ module GeneValidator
84
50
  # +type+: the type of the sequence: :nucleotide or :protein
85
51
  # Outputs:
86
52
  # Array of +Sequence+ objects corresponding to the list of hits
87
- def parse_next(iterator, type = config[:type])
88
- hits = []
53
+ def parse_next(iterator)
89
54
  iter = iterator.next
90
55
 
91
56
  # parse blast the xml output and get the hits
92
57
  # hits obtained are proteins! (we use only blastp and blastx)
58
+ hits = []
93
59
  iter.each do |hit|
94
- seq = Query.new
95
-
60
+ seq = Query.new
96
61
  seq.length_protein = hit.len.to_i
97
62
  seq.type = :protein
98
63
  seq.identifier = hit.hit_id
99
64
  seq.definition = hit.hit_def
100
- seq.accession_no = hit.accession
101
-
102
- # get all high-scoring segment pairs (hsp)
103
- hsps = []
104
-
105
- hit.hsps.each do |hsp|
106
- current_hsp = Hsp.new
107
- current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
65
+ seq.accession_no = hit.accession
66
+ seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }
108
67
 
109
- current_hsp.hit_from = hsp.hit_from.to_i
110
- current_hsp.hit_to = hsp.hit_to.to_i
111
- current_hsp.match_query_from = hsp.query_from.to_i
112
- current_hsp.match_query_to = hsp.query_to.to_i
113
-
114
- if type == :nucleotide
115
- current_hsp.match_query_from /= 3
116
- current_hsp.match_query_to /= 3
117
- current_hsp.match_query_from += 1
118
- current_hsp.match_query_to += 1
119
- end
120
-
121
- current_hsp.query_reading_frame = hsp.query_frame.to_i
122
-
123
- current_hsp.hit_alignment = hsp.hseq.to_s
124
- seq_type = guess_sequence_type(current_hsp.hit_alignment)
125
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
126
-
127
- current_hsp.query_alignment = hsp.qseq.to_s
128
- seq_type = guess_sequence_type(current_hsp.query_alignment)
129
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
130
-
131
- current_hsp.align_len = hsp.align_len.to_i
132
- current_hsp.identity = hsp.identity.to_i
133
- current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f)
134
- .round(2)
135
-
136
- hsps.push(current_hsp)
137
- end
138
-
139
- seq.hsp_list = hsps
140
- hits.push(seq)
68
+ hits << seq
141
69
  end
142
-
143
70
  hits
144
- rescue SequenceTypeError => e
145
- $stderr.puts e
146
- exit 1
147
71
  rescue StopIteration
148
72
  nil
149
73
  end
@@ -164,7 +88,7 @@ module GeneValidator
164
88
  sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
165
89
  # get all sequence types
166
90
  sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
167
- .uniq.compact
91
+ .uniq.compact
168
92
 
169
93
  return nil if sequence_types.empty?
170
94
  sequence_types.first if sequence_types.length == 1
@@ -184,7 +108,7 @@ module GeneValidator
184
108
  return nil if cleaned_sequence.length < 10 # conservative
185
109
 
186
110
  type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
187
- (type == Bio::Sequence::NA) ? :nucleotide : :protein
111
+ type == Bio::Sequence::NA ? :nucleotide : :protein
188
112
  end
189
113
 
190
114
  ##
@@ -192,11 +116,32 @@ module GeneValidator
192
116
  def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
193
117
  lines = File.foreach(file).first(10)
194
118
  seqs = ''
195
- lines.each do |l|
196
- seqs += l.chomp unless l[0] == '>'
197
- end
119
+ lines.each { |l| seqs += l.chomp unless l[0] == '>' }
198
120
  guess_sequence_type(seqs)
199
121
  end
122
+
123
+ private
124
+
125
+ def blast_cmd(opt, config, remote)
126
+ blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
127
+ # -num_threads is not supported on remote databases
128
+ threads = remote ? '' : "-num_threads #{opt[:num_threads]}"
129
+
130
+ "#{blast_type} -query '#{opt[:input_fasta_file]}'" \
131
+ " -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
132
+ " -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
133
+ end
134
+
135
+ def print_blast_info_text(remote)
136
+ warn '' # a blank line
137
+ if remote
138
+ warn '==> BLAST search and subsequent analysis will be done on a remote'
139
+ warn ' database. Please use a local database for larger analysis.'
140
+ else
141
+ warn '==> Running BLAST. This may take a while.'
142
+ end
143
+ warn '' # a blank line
144
+ end
200
145
  end
201
146
  end
202
147
  end
@@ -13,7 +13,7 @@ module GeneValidator
13
13
  end
14
14
 
15
15
  def print
16
- $stderr.puts "Cluster: #{x} #{y}"
16
+ warn "Cluster: #{x} #{y}"
17
17
  end
18
18
 
19
19
  ##
@@ -44,7 +44,7 @@ module GeneValidator
44
44
  # Overload quality operator
45
45
  # Returns true if the pairs are equal, false otherwise
46
46
  def ==(other)
47
- (other.x == x && other.y == y) ? true : false
47
+ other.x == x && other.y == y ? true : false
48
48
  end
49
49
 
50
50
  def eql?(other)
@@ -66,7 +66,7 @@ module GeneValidator
66
66
 
67
67
  def print
68
68
  objects.each do |elem|
69
- $stderr.puts "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
69
+ warn "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
70
70
  end
71
71
  end
72
72
 
@@ -254,7 +254,7 @@ module GeneValidator
254
254
  # Real number
255
255
  def deviation(clusters, queryLength)
256
256
  hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten
257
- raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.gsub('[', '').gsub(']', '')
257
+ raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.delete('[').delete(']')
258
258
  R.eval("sd = sd(c(#{raw_hits}))")
259
259
  sd = R.pull('sd')
260
260
  sd = standard_deviation(hits)
@@ -273,11 +273,11 @@ module GeneValidator
273
273
  ##
274
274
  # Prints the current cluster
275
275
  def print
276
- $stderr.puts "Cluster: mean = #{mean}, density = #{density}"
276
+ warn "Cluster: mean = #{mean}, density = #{density}"
277
277
  lengths.sort { |a, b| a <=> b }.each do |elem|
278
- $stderr.puts "#{elem[0]}, #{elem[1]}"
278
+ warn "#{elem[0]}, #{elem[1]}"
279
279
  end
280
- $stderr.puts '--------------------------'
280
+ warn '--------------------------'
281
281
  end
282
282
 
283
283
  ##
@@ -337,7 +337,7 @@ module GeneValidator
337
337
  # clusters = array of clusters
338
338
  # initially each length belongs to a different cluster
339
339
  histogram.each do |e|
340
- $stderr.puts "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
340
+ warn "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
341
341
  hash = { e[0] => e[1] }
342
342
  cluster = PairCluster.new(hash)
343
343
  clusters.push(cluster)
@@ -355,7 +355,7 @@ module GeneValidator
355
355
  break if no_clusters != 0 && clusters.length == no_clusters
356
356
 
357
357
  iteration += iteration
358
- $stderr.puts "\nIteration #{iteration}" if debug
358
+ warn "\nIteration #{iteration}" if debug
359
359
 
360
360
  min_distance = 100_000_000
361
361
  cluster1 = 0
@@ -365,9 +365,7 @@ module GeneValidator
365
365
  [*(0..(clusters.length - 2))].each do |i|
366
366
  [*((i + 1)..(clusters.length - 1))].each do |j|
367
367
  dist = clusters[i].distance(clusters[j], distance_method)
368
- if debug
369
- $stderr.puts "distance between clusters #{i} and #{j} is #{dist}"
370
- end
368
+ warn "distance between clusters #{i} and #{j} is #{dist}" if debug
371
369
  current_density = clusters[i].density + clusters[j].density
372
370
  if dist < min_distance
373
371
  min_distance = dist
@@ -383,14 +381,14 @@ module GeneValidator
383
381
  end
384
382
 
385
383
  # merge clusters 'cluster1' and 'cluster2'
386
- $stderr.puts "clusters to merge #{cluster1} and #{cluster2}" if debug
384
+ warn "clusters to merge #{cluster1} and #{cluster2}" if debug
387
385
 
388
386
  clusters[cluster1].add(clusters[cluster2])
389
387
  clusters.delete_at(cluster2)
390
388
 
391
389
  if debug
392
390
  clusters.each_with_index do |elem, i|
393
- $stderr.puts "cluster #{i}"
391
+ warn "cluster #{i}"
394
392
  elem.print
395
393
  end
396
394
  end
@@ -437,8 +435,8 @@ module GeneValidator
437
435
 
438
436
  # clusters = array of clusters
439
437
  # initially each length belongs to a different cluster
440
- histogram.sort { |a, b| a[0] <=> b[0] }.each do |elem|
441
- $stderr.puts "len #{elem[0]} appears #{elem[1]} times" if debug
438
+ histogram.sort_by { |a| a[0] }.each do |elem|
439
+ warn "len #{elem[0]} appears #{elem[1]} times" if debug
442
440
  hash = { elem[0] => elem[1] }
443
441
  cluster = Cluster.new(hash)
444
442
  clusters.push(cluster)
@@ -456,7 +454,7 @@ module GeneValidator
456
454
  break if no_clusters != 0 && clusters.length == no_clusters
457
455
 
458
456
  iteration += iteration
459
- $stderr.puts "\nIteration #{iteration}" if debug
457
+ warn "\nIteration #{iteration}" if debug
460
458
 
461
459
  min_distance = 100_000_000
462
460
  cluster = 0
@@ -464,9 +462,7 @@ module GeneValidator
464
462
 
465
463
  clusters[0..clusters.length - 2].each_with_index do |_item, i|
466
464
  dist = clusters[i].distance(clusters[i + 1], distance_method)
467
- if debug
468
- $stderr.puts "distance btwn clusters #{i} and #{i + 1} is #{dist}"
469
- end
465
+ warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
470
466
  current_density = clusters[i].density + clusters[i + 1].density
471
467
  if dist < min_distance
472
468
  min_distance = dist
@@ -485,14 +481,14 @@ module GeneValidator
485
481
  end
486
482
 
487
483
  # merge clusters 'cluster' and 'cluster'+1
488
- $stderr.puts "clusters to merge #{cluster} and #{cluster + 1}" if debug
484
+ warn "clusters to merge #{cluster} and #{cluster + 1}" if debug
489
485
 
490
486
  clusters[cluster].add(clusters[cluster + 1])
491
487
  clusters.delete_at(cluster + 1)
492
488
 
493
489
  if debug
494
490
  clusters.each_with_index do |elem, i|
495
- $stderr.puts "cluster #{i}"
491
+ warn "cluster #{i}"
496
492
  elem.print
497
493
  end
498
494
  end