genevalidator 1.6.12 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +30 -1
- data/.ruby-version +1 -0
- data/.travis.yml +13 -12
- data/Gemfile +4 -1
- data/Gemfile.lock +135 -0
- data/README.md +104 -122
- data/Rakefile +377 -5
- data/aux/gv_results.slim +155 -0
- data/aux/html_files/css/gv.compiled.min.css +8 -0
- data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
- data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
- data/aux/{files → html_files}/css/src/style.css +0 -0
- data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
- data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
- data/aux/{files → html_files}/img/gene.png +0 -0
- data/aux/html_files/js/gv.compiled.min.js +1 -0
- data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
- data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
- data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
- data/aux/{files → html_files}/js/src/plots.js +1 -1
- data/aux/{files → html_files}/js/src/script.js +0 -0
- data/aux/{files → html_files}/json/.gitkeep +0 -0
- data/bin/genevalidator +393 -56
- data/exemplar_data/README.md +60 -0
- data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
- data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
- data/genevalidator.gemspec +35 -20
- data/install.sh +92 -0
- data/lib/genevalidator.rb +171 -56
- data/lib/genevalidator/arg_validation.rb +26 -55
- data/lib/genevalidator/blast.rb +44 -99
- data/lib/genevalidator/clusterization.rb +18 -22
- data/lib/genevalidator/exceptions.rb +17 -17
- data/lib/genevalidator/ext/array.rb +21 -4
- data/lib/genevalidator/get_raw_sequences.rb +32 -31
- data/lib/genevalidator/hsp.rb +31 -2
- data/lib/genevalidator/json_to_gv_results.rb +38 -122
- data/lib/genevalidator/output.rb +158 -172
- data/lib/genevalidator/output_files.rb +134 -0
- data/lib/genevalidator/pool.rb +2 -5
- data/lib/genevalidator/query.rb +1 -1
- data/lib/genevalidator/tabular_parser.rb +8 -29
- data/lib/genevalidator/validation.rb +48 -90
- data/lib/genevalidator/validation_alignment.rb +64 -75
- data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
- data/lib/genevalidator/validation_duplication.rb +85 -84
- data/lib/genevalidator/validation_gene_merge.rb +46 -35
- data/lib/genevalidator/validation_length_cluster.rb +18 -15
- data/lib/genevalidator/validation_length_rank.rb +19 -15
- data/lib/genevalidator/validation_maker_qi.rb +13 -12
- data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
- data/lib/genevalidator/validation_report.rb +1 -1
- data/lib/genevalidator/validation_test.rb +1 -1
- data/lib/genevalidator/version.rb +1 -1
- data/test/overall.rb +1 -1
- data/test/test_all_validations.rb +36 -24
- data/test/test_blast.rb +39 -24
- data/test/test_clusterization_2d.rb +4 -4
- data/test/test_helper.rb +2 -2
- data/test/test_query.rb +16 -20
- data/test/test_validation_open_reading_frame.rb +122 -122
- data/test/test_validations.rb +12 -10
- metadata +94 -79
- data/aux/files/css/genevalidator.compiled.min.css +0 -16
- data/aux/files/js/genevalidator.compiled.min.js +0 -28
- data/aux/json_footer.erb +0 -8
- data/aux/json_header.erb +0 -19
- data/aux/json_query.erb +0 -15
- data/aux/template_footer.erb +0 -8
- data/aux/template_header.erb +0 -19
- data/aux/template_query.erb +0 -14
- data/data/README.md +0 -57
- data/data/mrna_data.fasta.blast_tabular +0 -3567
- data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
- data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
- data/data/mrna_data.fasta.blast_xml +0 -39800
- data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
- data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
- data/data/mrna_data.fasta.json +0 -1
- data/data/protein_data.fasta.blast_tabular +0 -3278
- data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
- data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
- data/data/protein_data.fasta.blast_xml +0 -26228
- data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
- data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
- data/data/protein_data.fasta.json +0 -1
|
@@ -16,7 +16,6 @@ module GeneValidator
|
|
|
16
16
|
|
|
17
17
|
def validate_args
|
|
18
18
|
@opt = opt
|
|
19
|
-
assert_output_dir_does_not_exist
|
|
20
19
|
assert_file_present('input file', opt[:input_fasta_file])
|
|
21
20
|
assert_input_file_probably_fasta
|
|
22
21
|
assert_input_sequence
|
|
@@ -38,7 +37,7 @@ module GeneValidator
|
|
|
38
37
|
private
|
|
39
38
|
|
|
40
39
|
def assert_validations_arg
|
|
41
|
-
validations = %w
|
|
40
|
+
validations = %w[lenc lenr frame merge dup orf align]
|
|
42
41
|
if @opt[:validations]
|
|
43
42
|
val = @opt[:validations].collect { |v| v.strip.downcase }
|
|
44
43
|
validations = val unless val.include? 'all'
|
|
@@ -48,13 +47,13 @@ module GeneValidator
|
|
|
48
47
|
|
|
49
48
|
def check_num_threads
|
|
50
49
|
@opt[:num_threads] = Integer(@opt[:num_threads])
|
|
51
|
-
unless @opt[:num_threads]
|
|
52
|
-
|
|
53
|
-
|
|
50
|
+
unless @opt[:num_threads].positive?
|
|
51
|
+
warn 'Number of threads can not be lower than 0'
|
|
52
|
+
warn 'Setting number of threads to 1'
|
|
54
53
|
@opt[:num_threads] = 1
|
|
55
54
|
end
|
|
56
55
|
return unless @opt[:num_threads] > 256
|
|
57
|
-
|
|
56
|
+
warn "Number of threads set at #{@opt[:num_threads]} is" \
|
|
58
57
|
' unusually high.'
|
|
59
58
|
end
|
|
60
59
|
|
|
@@ -68,45 +67,35 @@ module GeneValidator
|
|
|
68
67
|
end
|
|
69
68
|
end
|
|
70
69
|
|
|
71
|
-
def assert_output_dir_does_not_exist
|
|
72
|
-
output_dir = "#{@opt[:input_fasta_file]}.html"
|
|
73
|
-
return unless File.exist?(output_dir)
|
|
74
|
-
$stderr.puts 'The output directory already exists for this fasta file.'
|
|
75
|
-
$stderr.puts "\nPlease remove the following directory: #{output_dir}\n"
|
|
76
|
-
$stderr.puts "You can run the following command to remove the folder.\n"
|
|
77
|
-
$stderr.puts "\n $ rm -r #{output_dir} \n"
|
|
78
|
-
exit 1
|
|
79
|
-
end
|
|
80
|
-
|
|
81
70
|
def assert_tabular_options_exists
|
|
82
71
|
return if @opt[:blast_tabular_options]
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
72
|
+
warn '*** Error: BLAST tabular options (-o) have not been set.'
|
|
73
|
+
warn ' Please set the "-o" option with the custom format'
|
|
74
|
+
warn ' used in the BLAST -outfmt argument'
|
|
86
75
|
exit 1
|
|
87
76
|
end
|
|
88
77
|
|
|
89
78
|
def assert_input_file_probably_fasta
|
|
90
79
|
File.open(@opt[:input_fasta_file], 'r') do |file_stream|
|
|
91
|
-
|
|
80
|
+
file_stream.readline[0] == '>'
|
|
92
81
|
end
|
|
93
82
|
end
|
|
94
83
|
|
|
95
84
|
def assert_file_present(desc, file, exit_code = 1)
|
|
96
85
|
return if file && File.exist?(File.expand_path(file))
|
|
97
|
-
|
|
86
|
+
warn "*** Error: Couldn't find the #{desc}: #{file}."
|
|
98
87
|
exit exit_code
|
|
99
88
|
end
|
|
100
89
|
|
|
101
|
-
|
|
90
|
+
alias assert_dir_present assert_file_present
|
|
102
91
|
|
|
103
92
|
def assert_input_sequence
|
|
104
93
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
|
105
94
|
type = BlastUtils.type_of_sequences(fasta_content)
|
|
106
|
-
return if
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
95
|
+
return if %i[nucleotide protein].include? type
|
|
96
|
+
warn '*** Error: The input files does not contain just protein'
|
|
97
|
+
warn ' or nucleotide data.'
|
|
98
|
+
warn ' Please correct this and try again.'
|
|
110
99
|
exit 1
|
|
111
100
|
end
|
|
112
101
|
|
|
@@ -116,8 +105,8 @@ module GeneValidator
|
|
|
116
105
|
if File.exist?(bin) && File.directory?(bin)
|
|
117
106
|
add_to_path(bin)
|
|
118
107
|
else
|
|
119
|
-
|
|
120
|
-
|
|
108
|
+
warn '*** The following bin directory does not exist:'
|
|
109
|
+
warn " #{bin}"
|
|
121
110
|
end
|
|
122
111
|
end
|
|
123
112
|
end
|
|
@@ -131,10 +120,10 @@ module GeneValidator
|
|
|
131
120
|
|
|
132
121
|
def assert_mafft_installation
|
|
133
122
|
return if command?('mafft')
|
|
134
|
-
|
|
135
|
-
|
|
123
|
+
warn '*** Could not find Mafft binaries.'
|
|
124
|
+
warn ' Ignoring error and continuing - Please note that' \
|
|
136
125
|
' some validations may be skipped.'
|
|
137
|
-
|
|
126
|
+
warn # a blank line
|
|
138
127
|
end
|
|
139
128
|
end
|
|
140
129
|
|
|
@@ -142,7 +131,7 @@ module GeneValidator
|
|
|
142
131
|
class Blast
|
|
143
132
|
class << self
|
|
144
133
|
# Use a fixed minimum version of BLAST+
|
|
145
|
-
MINIMUM_BLAST_VERSION = '2.2.30+'
|
|
134
|
+
MINIMUM_BLAST_VERSION = '2.2.30+'.freeze
|
|
146
135
|
# Use the following exit codes, or 1.
|
|
147
136
|
EXIT_BLAST_NOT_INSTALLED = 2
|
|
148
137
|
EXIT_BLAST_NOT_COMPATIBLE = 3
|
|
@@ -150,7 +139,6 @@ module GeneValidator
|
|
|
150
139
|
|
|
151
140
|
def validate(opt)
|
|
152
141
|
assert_blast_installation
|
|
153
|
-
warn_if_remote_database(opt)
|
|
154
142
|
assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
|
|
155
143
|
end
|
|
156
144
|
|
|
@@ -160,27 +148,10 @@ module GeneValidator
|
|
|
160
148
|
assert_blast_compatible
|
|
161
149
|
end
|
|
162
150
|
|
|
163
|
-
def warn_if_remote_database(opt)
|
|
164
|
-
return if opt[:db] !~ /remote/
|
|
165
|
-
$stderr.puts # a blank line
|
|
166
|
-
if !opt[:raw_sequences] &&
|
|
167
|
-
(opt[:validations].include?('align') ||
|
|
168
|
-
opt[:validations].include?('dup'))
|
|
169
|
-
$stderr.puts 'Warning: Hit sequences will be fetched from remote' \
|
|
170
|
-
' server.'
|
|
171
|
-
else
|
|
172
|
-
$stderr.puts 'Warning: BLAST will be carried out on remote server.'
|
|
173
|
-
end
|
|
174
|
-
$stderr.puts 'This may take quite a bit of time.'
|
|
175
|
-
$stderr.puts 'You may want to install a local BLAST database for' \
|
|
176
|
-
' faster analyses.'
|
|
177
|
-
$stderr.puts # a blank line
|
|
178
|
-
end
|
|
179
|
-
|
|
180
151
|
def assert_local_blast_database_exists(db)
|
|
181
152
|
return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
|
|
182
|
-
|
|
183
|
-
|
|
153
|
+
warn '*** No BLAST database found at the provided path.'
|
|
154
|
+
warn ' Please ensure that the provided path is correct' \
|
|
184
155
|
' and then try again.'
|
|
185
156
|
exit EXIT_NO_BLAST_DATABASE
|
|
186
157
|
end
|
|
@@ -189,15 +160,15 @@ module GeneValidator
|
|
|
189
160
|
|
|
190
161
|
def assert_blast_installed
|
|
191
162
|
return if GVArgValidation.command?('blastdbcmd')
|
|
192
|
-
|
|
163
|
+
warn '*** Could not find BLAST+ binaries.'
|
|
193
164
|
exit EXIT_BLAST_NOT_INSTALLED
|
|
194
165
|
end
|
|
195
166
|
|
|
196
167
|
def assert_blast_compatible
|
|
197
168
|
version = `blastdbcmd -version`.split[1]
|
|
198
169
|
return if version >= MINIMUM_BLAST_VERSION
|
|
199
|
-
|
|
200
|
-
|
|
170
|
+
warn "*** Your BLAST+ version #{version} is outdated."
|
|
171
|
+
warn ' GeneValidator needs NCBI BLAST+ version' \
|
|
201
172
|
" #{MINIMUM_BLAST_VERSION} or higher."
|
|
202
173
|
exit EXIT_BLAST_NOT_COMPATIBLE
|
|
203
174
|
end
|
data/lib/genevalidator/blast.rb
CHANGED
|
@@ -12,34 +12,10 @@ module GeneValidator
|
|
|
12
12
|
class BlastUtils
|
|
13
13
|
class << self
|
|
14
14
|
extend Forwardable
|
|
15
|
-
def_delegators GeneValidator, :opt, :config
|
|
15
|
+
def_delegators GeneValidator, :opt, :config, :dirs
|
|
16
16
|
|
|
17
17
|
EVALUE = 1e-5
|
|
18
18
|
|
|
19
|
-
##
|
|
20
|
-
# Calls blast from standard input with specific parameters
|
|
21
|
-
# Params:
|
|
22
|
-
# +blast_type+: blast command in String format (e.g 'blast(x/p)')
|
|
23
|
-
# +query+: String containing the the query in fasta format
|
|
24
|
-
# +db+: database
|
|
25
|
-
# +num_threads+: The number of threads to run BLAST with.
|
|
26
|
-
# Output:
|
|
27
|
-
# String with the blast xml output
|
|
28
|
-
def run_blast(query, db = opt[:db], seq_type = config[:type],
|
|
29
|
-
num_threads = opt[:num_threads])
|
|
30
|
-
|
|
31
|
-
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
|
32
|
-
# -num_threads is not supported on remote databases
|
|
33
|
-
threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
|
34
|
-
|
|
35
|
-
blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \
|
|
36
|
-
" #{threads}"
|
|
37
|
-
|
|
38
|
-
cmd = "echo \"#{query}\" | #{blastcmd}"
|
|
39
|
-
`#{cmd} >/dev/null 2>&1`
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
##
|
|
43
19
|
# Runs BLAST on an input file
|
|
44
20
|
# Params:
|
|
45
21
|
# +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
|
|
@@ -49,32 +25,22 @@ module GeneValidator
|
|
|
49
25
|
# +nr_hits+: max number of hits
|
|
50
26
|
# Output:
|
|
51
27
|
# XML file
|
|
52
|
-
def run_blast_on_input_file
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return if opt[:blast_xml_file] || opt[:blast_tabular_file]
|
|
28
|
+
def run_blast_on_input_file
|
|
29
|
+
remote = opt[:db].match?(/remote/) ? true : false
|
|
30
|
+
print_blast_info_text(remote)
|
|
56
31
|
|
|
57
|
-
|
|
58
|
-
opt
|
|
32
|
+
log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
|
|
33
|
+
`#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`
|
|
59
34
|
|
|
60
|
-
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
|
61
|
-
# -num_threads is not supported on remote databases
|
|
62
|
-
threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
|
63
|
-
|
|
64
|
-
blastcmd = "#{blast_type} -query '#{input_file}'" \
|
|
65
|
-
" -out '#{opt[:blast_xml_file]}' -db #{db} " \
|
|
66
|
-
" -evalue #{EVALUE} -outfmt 5 #{threads}"
|
|
67
|
-
|
|
68
|
-
`#{blastcmd} >/dev/null 2>&1`
|
|
69
35
|
return unless File.zero?(opt[:blast_xml_file])
|
|
70
|
-
|
|
71
|
-
if
|
|
72
|
-
|
|
73
|
-
|
|
36
|
+
warn 'Blast failed to run on the input file.'
|
|
37
|
+
if remote
|
|
38
|
+
warn 'You are using BLAST with a remote database. Please'
|
|
39
|
+
warn 'ensure that you have internet access and try again.'
|
|
74
40
|
else
|
|
75
|
-
|
|
76
|
-
$stderr.puts 'ensure that you have internet access and try again.'
|
|
41
|
+
warn 'Please ensure that the BLAST database exists and try again.'
|
|
77
42
|
end
|
|
43
|
+
exit 1
|
|
78
44
|
end
|
|
79
45
|
|
|
80
46
|
##
|
|
@@ -84,66 +50,24 @@ module GeneValidator
|
|
|
84
50
|
# +type+: the type of the sequence: :nucleotide or :protein
|
|
85
51
|
# Outputs:
|
|
86
52
|
# Array of +Sequence+ objects corresponding to the list of hits
|
|
87
|
-
def parse_next(iterator
|
|
88
|
-
hits = []
|
|
53
|
+
def parse_next(iterator)
|
|
89
54
|
iter = iterator.next
|
|
90
55
|
|
|
91
56
|
# parse blast the xml output and get the hits
|
|
92
57
|
# hits obtained are proteins! (we use only blastp and blastx)
|
|
58
|
+
hits = []
|
|
93
59
|
iter.each do |hit|
|
|
94
|
-
seq
|
|
95
|
-
|
|
60
|
+
seq = Query.new
|
|
96
61
|
seq.length_protein = hit.len.to_i
|
|
97
62
|
seq.type = :protein
|
|
98
63
|
seq.identifier = hit.hit_id
|
|
99
64
|
seq.definition = hit.hit_def
|
|
100
|
-
seq.accession_no
|
|
101
|
-
|
|
102
|
-
# get all high-scoring segment pairs (hsp)
|
|
103
|
-
hsps = []
|
|
104
|
-
|
|
105
|
-
hit.hsps.each do |hsp|
|
|
106
|
-
current_hsp = Hsp.new
|
|
107
|
-
current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
|
|
65
|
+
seq.accession_no = hit.accession
|
|
66
|
+
seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }
|
|
108
67
|
|
|
109
|
-
|
|
110
|
-
current_hsp.hit_to = hsp.hit_to.to_i
|
|
111
|
-
current_hsp.match_query_from = hsp.query_from.to_i
|
|
112
|
-
current_hsp.match_query_to = hsp.query_to.to_i
|
|
113
|
-
|
|
114
|
-
if type == :nucleotide
|
|
115
|
-
current_hsp.match_query_from /= 3
|
|
116
|
-
current_hsp.match_query_to /= 3
|
|
117
|
-
current_hsp.match_query_from += 1
|
|
118
|
-
current_hsp.match_query_to += 1
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
current_hsp.query_reading_frame = hsp.query_frame.to_i
|
|
122
|
-
|
|
123
|
-
current_hsp.hit_alignment = hsp.hseq.to_s
|
|
124
|
-
seq_type = guess_sequence_type(current_hsp.hit_alignment)
|
|
125
|
-
fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
|
|
126
|
-
|
|
127
|
-
current_hsp.query_alignment = hsp.qseq.to_s
|
|
128
|
-
seq_type = guess_sequence_type(current_hsp.query_alignment)
|
|
129
|
-
fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
|
|
130
|
-
|
|
131
|
-
current_hsp.align_len = hsp.align_len.to_i
|
|
132
|
-
current_hsp.identity = hsp.identity.to_i
|
|
133
|
-
current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f)
|
|
134
|
-
.round(2)
|
|
135
|
-
|
|
136
|
-
hsps.push(current_hsp)
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
seq.hsp_list = hsps
|
|
140
|
-
hits.push(seq)
|
|
68
|
+
hits << seq
|
|
141
69
|
end
|
|
142
|
-
|
|
143
70
|
hits
|
|
144
|
-
rescue SequenceTypeError => e
|
|
145
|
-
$stderr.puts e
|
|
146
|
-
exit 1
|
|
147
71
|
rescue StopIteration
|
|
148
72
|
nil
|
|
149
73
|
end
|
|
@@ -164,7 +88,7 @@ module GeneValidator
|
|
|
164
88
|
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
|
165
89
|
# get all sequence types
|
|
166
90
|
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
|
|
167
|
-
|
|
91
|
+
.uniq.compact
|
|
168
92
|
|
|
169
93
|
return nil if sequence_types.empty?
|
|
170
94
|
sequence_types.first if sequence_types.length == 1
|
|
@@ -184,7 +108,7 @@ module GeneValidator
|
|
|
184
108
|
return nil if cleaned_sequence.length < 10 # conservative
|
|
185
109
|
|
|
186
110
|
type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
|
|
187
|
-
|
|
111
|
+
type == Bio::Sequence::NA ? :nucleotide : :protein
|
|
188
112
|
end
|
|
189
113
|
|
|
190
114
|
##
|
|
@@ -192,11 +116,32 @@ module GeneValidator
|
|
|
192
116
|
def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
|
|
193
117
|
lines = File.foreach(file).first(10)
|
|
194
118
|
seqs = ''
|
|
195
|
-
lines.each
|
|
196
|
-
seqs += l.chomp unless l[0] == '>'
|
|
197
|
-
end
|
|
119
|
+
lines.each { |l| seqs += l.chomp unless l[0] == '>' }
|
|
198
120
|
guess_sequence_type(seqs)
|
|
199
121
|
end
|
|
122
|
+
|
|
123
|
+
private
|
|
124
|
+
|
|
125
|
+
def blast_cmd(opt, config, remote)
|
|
126
|
+
blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
|
|
127
|
+
# -num_threads is not supported on remote databases
|
|
128
|
+
threads = remote ? '' : "-num_threads #{opt[:num_threads]}"
|
|
129
|
+
|
|
130
|
+
"#{blast_type} -query '#{opt[:input_fasta_file]}'" \
|
|
131
|
+
" -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
|
|
132
|
+
" -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def print_blast_info_text(remote)
|
|
136
|
+
warn '' # a blank line
|
|
137
|
+
if remote
|
|
138
|
+
warn '==> BLAST search and subsequent analysis will be done on a remote'
|
|
139
|
+
warn ' database. Please use a local database for larger analysis.'
|
|
140
|
+
else
|
|
141
|
+
warn '==> Running BLAST. This may take a while.'
|
|
142
|
+
end
|
|
143
|
+
warn '' # a blank line
|
|
144
|
+
end
|
|
200
145
|
end
|
|
201
146
|
end
|
|
202
147
|
end
|
|
@@ -13,7 +13,7 @@ module GeneValidator
|
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def print
|
|
16
|
-
|
|
16
|
+
warn "Cluster: #{x} #{y}"
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
##
|
|
@@ -44,7 +44,7 @@ module GeneValidator
|
|
|
44
44
|
# Overload quality operator
|
|
45
45
|
# Returns true if the pairs are equal, false otherwise
|
|
46
46
|
def ==(other)
|
|
47
|
-
|
|
47
|
+
other.x == x && other.y == y ? true : false
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
def eql?(other)
|
|
@@ -66,7 +66,7 @@ module GeneValidator
|
|
|
66
66
|
|
|
67
67
|
def print
|
|
68
68
|
objects.each do |elem|
|
|
69
|
-
|
|
69
|
+
warn "(#{elem[0].x},#{elem[0].y}): #{elem[1]}"
|
|
70
70
|
end
|
|
71
71
|
end
|
|
72
72
|
|
|
@@ -254,7 +254,7 @@ module GeneValidator
|
|
|
254
254
|
# Real number
|
|
255
255
|
def deviation(clusters, queryLength)
|
|
256
256
|
hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten
|
|
257
|
-
raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.
|
|
257
|
+
raw_hits = clusters.map { |c| c.lengths.map { |x| Array.new(x[1], x[0]) }.flatten }.flatten.to_s.delete('[').delete(']')
|
|
258
258
|
R.eval("sd = sd(c(#{raw_hits}))")
|
|
259
259
|
sd = R.pull('sd')
|
|
260
260
|
sd = standard_deviation(hits)
|
|
@@ -273,11 +273,11 @@ module GeneValidator
|
|
|
273
273
|
##
|
|
274
274
|
# Prints the current cluster
|
|
275
275
|
def print
|
|
276
|
-
|
|
276
|
+
warn "Cluster: mean = #{mean}, density = #{density}"
|
|
277
277
|
lengths.sort { |a, b| a <=> b }.each do |elem|
|
|
278
|
-
|
|
278
|
+
warn "#{elem[0]}, #{elem[1]}"
|
|
279
279
|
end
|
|
280
|
-
|
|
280
|
+
warn '--------------------------'
|
|
281
281
|
end
|
|
282
282
|
|
|
283
283
|
##
|
|
@@ -337,7 +337,7 @@ module GeneValidator
|
|
|
337
337
|
# clusters = array of clusters
|
|
338
338
|
# initially each length belongs to a different cluster
|
|
339
339
|
histogram.each do |e|
|
|
340
|
-
|
|
340
|
+
warn "pair (#{e[0].x} #{e[0].y}) appears #{e[1]} times" if debug
|
|
341
341
|
hash = { e[0] => e[1] }
|
|
342
342
|
cluster = PairCluster.new(hash)
|
|
343
343
|
clusters.push(cluster)
|
|
@@ -355,7 +355,7 @@ module GeneValidator
|
|
|
355
355
|
break if no_clusters != 0 && clusters.length == no_clusters
|
|
356
356
|
|
|
357
357
|
iteration += iteration
|
|
358
|
-
|
|
358
|
+
warn "\nIteration #{iteration}" if debug
|
|
359
359
|
|
|
360
360
|
min_distance = 100_000_000
|
|
361
361
|
cluster1 = 0
|
|
@@ -365,9 +365,7 @@ module GeneValidator
|
|
|
365
365
|
[*(0..(clusters.length - 2))].each do |i|
|
|
366
366
|
[*((i + 1)..(clusters.length - 1))].each do |j|
|
|
367
367
|
dist = clusters[i].distance(clusters[j], distance_method)
|
|
368
|
-
if debug
|
|
369
|
-
$stderr.puts "distance between clusters #{i} and #{j} is #{dist}"
|
|
370
|
-
end
|
|
368
|
+
warn "distance between clusters #{i} and #{j} is #{dist}" if debug
|
|
371
369
|
current_density = clusters[i].density + clusters[j].density
|
|
372
370
|
if dist < min_distance
|
|
373
371
|
min_distance = dist
|
|
@@ -383,14 +381,14 @@ module GeneValidator
|
|
|
383
381
|
end
|
|
384
382
|
|
|
385
383
|
# merge clusters 'cluster1' and 'cluster2'
|
|
386
|
-
|
|
384
|
+
warn "clusters to merge #{cluster1} and #{cluster2}" if debug
|
|
387
385
|
|
|
388
386
|
clusters[cluster1].add(clusters[cluster2])
|
|
389
387
|
clusters.delete_at(cluster2)
|
|
390
388
|
|
|
391
389
|
if debug
|
|
392
390
|
clusters.each_with_index do |elem, i|
|
|
393
|
-
|
|
391
|
+
warn "cluster #{i}"
|
|
394
392
|
elem.print
|
|
395
393
|
end
|
|
396
394
|
end
|
|
@@ -437,8 +435,8 @@ module GeneValidator
|
|
|
437
435
|
|
|
438
436
|
# clusters = array of clusters
|
|
439
437
|
# initially each length belongs to a different cluster
|
|
440
|
-
histogram.
|
|
441
|
-
|
|
438
|
+
histogram.sort_by { |a| a[0] }.each do |elem|
|
|
439
|
+
warn "len #{elem[0]} appears #{elem[1]} times" if debug
|
|
442
440
|
hash = { elem[0] => elem[1] }
|
|
443
441
|
cluster = Cluster.new(hash)
|
|
444
442
|
clusters.push(cluster)
|
|
@@ -456,7 +454,7 @@ module GeneValidator
|
|
|
456
454
|
break if no_clusters != 0 && clusters.length == no_clusters
|
|
457
455
|
|
|
458
456
|
iteration += iteration
|
|
459
|
-
|
|
457
|
+
warn "\nIteration #{iteration}" if debug
|
|
460
458
|
|
|
461
459
|
min_distance = 100_000_000
|
|
462
460
|
cluster = 0
|
|
@@ -464,9 +462,7 @@ module GeneValidator
|
|
|
464
462
|
|
|
465
463
|
clusters[0..clusters.length - 2].each_with_index do |_item, i|
|
|
466
464
|
dist = clusters[i].distance(clusters[i + 1], distance_method)
|
|
467
|
-
if debug
|
|
468
|
-
$stderr.puts "distance btwn clusters #{i} and #{i + 1} is #{dist}"
|
|
469
|
-
end
|
|
465
|
+
warn "distance btwn clusters #{i} and #{i + 1} is #{dist}" if debug
|
|
470
466
|
current_density = clusters[i].density + clusters[i + 1].density
|
|
471
467
|
if dist < min_distance
|
|
472
468
|
min_distance = dist
|
|
@@ -485,14 +481,14 @@ module GeneValidator
|
|
|
485
481
|
end
|
|
486
482
|
|
|
487
483
|
# merge clusters 'cluster' and 'cluster'+1
|
|
488
|
-
|
|
484
|
+
warn "clusters to merge #{cluster} and #{cluster + 1}" if debug
|
|
489
485
|
|
|
490
486
|
clusters[cluster].add(clusters[cluster + 1])
|
|
491
487
|
clusters.delete_at(cluster + 1)
|
|
492
488
|
|
|
493
489
|
if debug
|
|
494
490
|
clusters.each_with_index do |elem, i|
|
|
495
|
-
|
|
491
|
+
warn "cluster #{i}"
|
|
496
492
|
elem.print
|
|
497
493
|
end
|
|
498
494
|
end
|