sequenceserver 2.1.0 → 3.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sequenceserver might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/COPYRIGHT.txt +1 -1
- data/bin/sequenceserver +10 -3
- data/lib/sequenceserver/blast/error.rb +53 -0
- data/lib/sequenceserver/blast/formatter.rb +13 -4
- data/lib/sequenceserver/blast/job.rb +2 -43
- data/lib/sequenceserver/blast/report.rb +33 -3
- data/lib/sequenceserver/config.rb +4 -1
- data/lib/sequenceserver/job.rb +21 -11
- data/lib/sequenceserver/makeblastdb-modified-with-cache.rb +345 -0
- data/lib/sequenceserver/makeblastdb.rb +97 -75
- data/lib/sequenceserver/pool.rb +1 -1
- data/lib/sequenceserver/report.rb +1 -5
- data/lib/sequenceserver/routes.rb +52 -5
- data/lib/sequenceserver/server.rb +1 -1
- data/lib/sequenceserver/sys.rb +1 -1
- data/lib/sequenceserver/version.rb +1 -1
- data/lib/sequenceserver.rb +11 -2
- data/public/404.html +27 -0
- data/public/config.js +0 -6
- data/public/css/grapher.css +1 -1
- data/public/css/sequenceserver.css +22 -11
- data/public/css/sequenceserver.min.css +2 -2
- data/public/js/circos.js +7 -3
- data/public/js/dnd.js +3 -3
- data/public/js/fastq_to_fasta.js +35 -0
- data/public/js/form.js +30 -11
- data/public/js/grapher.js +123 -113
- data/public/js/hit.js +8 -2
- data/public/js/hits_overview.js +4 -1
- data/public/js/jquery_world.js +0 -1
- data/public/js/kablammo.js +4 -0
- data/public/js/length_distribution.js +5 -1
- data/public/js/null_plugins/download_links.js +7 -0
- data/public/js/null_plugins/hit_buttons.js +11 -0
- data/public/js/null_plugins/report_plugins.js +18 -0
- data/public/js/query.js +26 -6
- data/public/js/report.js +92 -22
- data/public/js/search.js +0 -8
- data/public/js/sidebar.js +11 -1
- data/public/js/tests/advanced_parameters.spec.js +36 -0
- data/public/js/tests/mock_data/sequences.js +49 -0
- data/public/js/tests/report.spec.js +62 -6
- data/public/js/tests/search_query.spec.js +45 -19
- data/public/js/visualisation_helpers.js +1 -1
- data/public/sequenceserver-report.min.js +76 -42
- data/public/sequenceserver-search.min.js +34 -33
- data/views/layout.erb +9 -12
- metadata +34 -23
@@ -0,0 +1,345 @@
|
|
1
|
+
require 'find'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module SequenceServer
|
5
|
+
# Smart `makeblastdb` wrapper: recursively scans database directory determining
|
6
|
+
# which files need to be formatted or re-formatted.
|
7
|
+
#
|
8
|
+
# Example usage:
|
9
|
+
#
|
10
|
+
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
11
|
+
# makeblastdb.scan && makeblastdb.run
|
12
|
+
#
|
13
|
+
class MAKEBLASTDB
|
14
|
+
extend Forwardable
|
15
|
+
|
16
|
+
def_delegators SequenceServer, :config, :sys, :logger
|
17
|
+
|
18
|
+
def initialize(database_dir)
|
19
|
+
@database_dir = database_dir
|
20
|
+
end
|
21
|
+
|
22
|
+
attr_reader :database_dir
|
23
|
+
attr_reader :formatted_fastas
|
24
|
+
attr_reader :fastas_to_format
|
25
|
+
attr_reader :fastas_to_reformat
|
26
|
+
|
27
|
+
# Scans the database directory to determine which FASTA files require
|
28
|
+
# formatting or re-formatting.
|
29
|
+
#
|
30
|
+
# Returns `true` if there are files to (re-)format, `false` otherwise.
|
31
|
+
def scan
|
32
|
+
# We need to know the list of formatted FASTAs as reported by blastdbcmd
|
33
|
+
# first. This is required to determine both unformatted FASTAs and those
|
34
|
+
# that require reformatting.
|
35
|
+
@formatted_fastas = []
|
36
|
+
determine_formatted_fastas
|
37
|
+
|
38
|
+
# Now determine FASTA files that are unformatted or require reformatting.
|
39
|
+
@fastas_to_format = []
|
40
|
+
determine_unformatted_fastas
|
41
|
+
@fastas_to_reformat = []
|
42
|
+
determine_fastas_to_reformat
|
43
|
+
|
44
|
+
# Return true if there are files to be (re-)formatted or false otherwise.
|
45
|
+
!@fastas_to_format.empty? || !@fastas_to_reformat.empty?
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns true if at least one database in database directory is formatted.
|
49
|
+
def any_formatted?
|
50
|
+
!@formatted_fastas.empty?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Returns true if there is at least one unformatted FASTA in the databases
|
54
|
+
# directory.
|
55
|
+
def any_unformatted?
|
56
|
+
!@fastas_to_format.empty?
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns true if the databases directory contains one or more incompatible
|
60
|
+
# databases.
|
61
|
+
#
|
62
|
+
# Note that it is okay to only use V4 databases or only V5 databases.
|
63
|
+
# Incompatibility arises when they are mixed.
|
64
|
+
def any_incompatible?
|
65
|
+
return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
|
66
|
+
return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
|
67
|
+
true
|
68
|
+
end
|
69
|
+
|
70
|
+
# Runs makeblastdb on each file in `@fastas_to_format` and
|
71
|
+
# `@fastas_to_reformat`. Will do nothing unless `#scan`
|
72
|
+
# has been run before.
|
73
|
+
def run
|
74
|
+
format
|
75
|
+
reformat
|
76
|
+
end
|
77
|
+
|
78
|
+
# Format any unformatted FASTA files in database directory. Returns Array
|
79
|
+
# of files that were formatted.
|
80
|
+
def format
|
81
|
+
# Make the intent clear as well as ensure the program won't crash if we
|
82
|
+
# accidentally call format before calling scan.
|
83
|
+
return unless @fastas_to_format
|
84
|
+
@fastas_to_format.select do |path, title, type|
|
85
|
+
make_blast_database('format', path, title, type)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Re-format databases that require reformatting. Returns Array of files
|
90
|
+
# that were reformatted.
|
91
|
+
def reformat
|
92
|
+
# Make the intent clear as well as ensure the program won't crash if
|
93
|
+
# we accidentally call reformat before calling scan.
|
94
|
+
return unless @fastas_to_reformat
|
95
|
+
@fastas_to_reformat.select do |path, title, type, non_parse_seqids|
|
96
|
+
make_blast_database('reformat', path, title, type, non_parse_seqids)
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Determines which FASTA files in the database directory are already
|
103
|
+
# formatted. Adds to @formatted_fastas.
|
104
|
+
def determine_formatted_fastas
|
105
|
+
blastdbcmd.each_line do |line|
|
106
|
+
path, *rest = line.chomp.split("\t")
|
107
|
+
next if multipart_database_name?(path)
|
108
|
+
rest << get_categories(path)
|
109
|
+
@formatted_fastas << Database.new(path, *rest)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# Determines which FASTA files in the database directory require
|
114
|
+
# reformatting. Adds to @fastas_to_reformat.
|
115
|
+
def determine_fastas_to_reformat
|
116
|
+
@formatted_fastas.each do |ff|
|
117
|
+
if ff.v4? || ff.non_parse_seqids?
|
118
|
+
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
# Determines which FASTA files in the database directory are
|
124
|
+
# unformatted. Adds to @fastas_to_format.
|
125
|
+
def determine_unformatted_fastas
|
126
|
+
# Add a trailing slash to database_dir - Find.find doesn't work as
|
127
|
+
# expected without the trailing slash if database_dir is a symlink
|
128
|
+
# inside a docker container.
|
129
|
+
Find.find(database_dir + '/') do |path|
|
130
|
+
next if File.directory?(path)
|
131
|
+
next unless probably_fasta?(path)
|
132
|
+
next if @formatted_fastas.any? { |f| f[0] == path }
|
133
|
+
|
134
|
+
@fastas_to_format << [path,
|
135
|
+
make_db_title(path),
|
136
|
+
guess_sequence_type_in_fasta(path)]
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|
141
|
+
# directory. Returns the output of `blastdbcmd`. This method is called
|
142
|
+
# by `determine_formatted_fastas`.
|
143
|
+
def blastdbcmd
|
144
|
+
# calculate checksum of database directory
|
145
|
+
current_db_checksum = Zlib::crc32(Dir.glob(File.join(config[:database_dir], '/**/*')).map {
|
146
|
+
|path| path.to_s + "_" + File.mtime(path).to_s + "_" + File.size(path).to_s
|
147
|
+
}.to_s)
|
148
|
+
|
149
|
+
checksum_path = config[:database_dir].chomp('/') + '.checksum'
|
150
|
+
index_path = config[:database_dir].chomp('/') + '.index'
|
151
|
+
|
152
|
+
if File.exists?(checksum_path)
|
153
|
+
if current_db_checksum == File.read(checksum_path).to_i # db directory hasn't changed
|
154
|
+
if File.exists?(index_path) # lets use existing index
|
155
|
+
logger.info "Using existing database index: #{index_path}"
|
156
|
+
return File.read(index_path)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end rescue logger.error "Could not read: #{checksum_path} or #{index_path}"
|
160
|
+
|
161
|
+
# database directory has changed, or index file doesn't exist
|
162
|
+
# thus we run blastdbcmd to get formatted FASTA files
|
163
|
+
logger.info "Scanning for BLAST databases & creating index"
|
164
|
+
cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
|
165
|
+
' -list_outfmt "%f %t %p %n %l %d %v"'
|
166
|
+
out, err = sys(cmd, path: config[:bin])
|
167
|
+
errpat = /BLAST Database error/
|
168
|
+
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
|
169
|
+
|
170
|
+
# write checksum and index to file
|
171
|
+
File.open(checksum_path, 'w') { |f| f.write(current_db_checksum) } rescue
|
172
|
+
logger.error "Could not write database checksum to file" + checksum_path
|
173
|
+
File.open(index_path, 'w') { |f| f.write(out) } rescue
|
174
|
+
logger.error "Could not write database index to file" + index_path
|
175
|
+
|
176
|
+
return out
|
177
|
+
rescue CommandFailed => e
|
178
|
+
fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
|
179
|
+
end
|
180
|
+
|
181
|
+
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
182
|
+
def make_blast_database(action, file, title, type, non_parse_seqids = false)
|
183
|
+
return unless make_blast_database?(action, file, type)
|
184
|
+
title = confirm_database_title(title)
|
185
|
+
extract_fasta(file) unless File.exist?(file)
|
186
|
+
taxonomy = taxid_map(file, non_parse_seqids) || taxid
|
187
|
+
_make_blast_database(file, type, title, taxonomy)
|
188
|
+
end
|
189
|
+
|
190
|
+
# Show file path and guessed sequence type to the user and obtain a y/n
|
191
|
+
# response.
|
192
|
+
#
|
193
|
+
# Returns true unless the user's response contains 'n' or 'N'.
|
194
|
+
def make_blast_database?(action, file, type)
|
195
|
+
puts
|
196
|
+
puts
|
197
|
+
puts "FASTA file to #{action}: #{file}"
|
198
|
+
puts "FASTA type: #{type}"
|
199
|
+
print 'Proceed? [y/n] (Default: y): '
|
200
|
+
response = STDIN.gets.to_s.strip
|
201
|
+
!response.match(/n/i)
|
202
|
+
end
|
203
|
+
|
204
|
+
# Show the database title that we are going to use to the user for
|
205
|
+
# confirmation.
|
206
|
+
#
|
207
|
+
# Returns user input if any. Auto-determined title otherwise.
|
208
|
+
def confirm_database_title(default)
|
209
|
+
print "Enter a database title or will use '#{default}': "
|
210
|
+
from_user = STDIN.gets.to_s.strip
|
211
|
+
from_user.empty? && default || from_user
|
212
|
+
end
|
213
|
+
|
214
|
+
# Check if a '.taxid_map.txt' file exists. If not, try getting it
|
215
|
+
# using blastdbcmd.
|
216
|
+
def taxid_map(db, non_parse_seqids)
|
217
|
+
return if non_parse_seqids
|
218
|
+
taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
|
219
|
+
extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
|
220
|
+
"-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get taxid from the user. Returns user input or 0.
|
224
|
+
#
|
225
|
+
# Using 0 as taxid is equivalent to not setting taxid for the database
|
226
|
+
# that will be created.
|
227
|
+
def taxid
|
228
|
+
default = 0
|
229
|
+
print 'Enter taxid (optional): '
|
230
|
+
user_response = STDIN.gets.strip
|
231
|
+
"-taxid #{user_response.empty? && default || Integer(user_response)}"
|
232
|
+
rescue ArgumentError # presumably from call to Integer()
|
233
|
+
puts 'taxid should be a number'
|
234
|
+
retry
|
235
|
+
end
|
236
|
+
|
237
|
+
def _make_blast_database(file, type, title, taxonomy)
|
238
|
+
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
|
239
|
+
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
240
|
+
" #{taxonomy}"
|
241
|
+
out, err = sys(cmd, path: config[:bin])
|
242
|
+
puts out.strip
|
243
|
+
puts err.strip
|
244
|
+
return true
|
245
|
+
rescue CommandFailed => e
|
246
|
+
puts <<~MSG
|
247
|
+
Could not create BLAST database for: #{file}
|
248
|
+
Tried: #{cmd}
|
249
|
+
stdout: #{e.stdout}
|
250
|
+
stderr: #{e.stderr}
|
251
|
+
MSG
|
252
|
+
exit!
|
253
|
+
end
|
254
|
+
|
255
|
+
# Extract FASTA file from BLAST database.
|
256
|
+
#
|
257
|
+
# Invoked while reformatting a BLAST database if the corresponding
|
258
|
+
# FASTA file does not exist.
|
259
|
+
def extract_fasta(db)
|
260
|
+
puts
|
261
|
+
puts 'Extracting sequences ...'
|
262
|
+
cmd = "blastdbcmd -entry all -db #{db}"
|
263
|
+
sys(cmd, stdout: db, path: config[:bin])
|
264
|
+
rescue CommandFailed => e
|
265
|
+
puts <<~MSG
|
266
|
+
Could not extract sequences from: #{db}
|
267
|
+
Tried: #{cmd}
|
268
|
+
stdout: #{e.stdout}
|
269
|
+
stderr: #{e.stderr}
|
270
|
+
MSG
|
271
|
+
exit!
|
272
|
+
end
|
273
|
+
|
274
|
+
def extract_taxid_map(db, taxmap_file)
|
275
|
+
cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
|
276
|
+
sys(cmd, stdout: taxmap_file, path: config[:bin])
|
277
|
+
rescue CommandFailed => e
|
278
|
+
# silence error
|
279
|
+
end
|
280
|
+
|
281
|
+
# Returns true if the database name appears to be a multi-part database
|
282
|
+
# name.
|
283
|
+
#
|
284
|
+
# e.g.
|
285
|
+
# /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
|
286
|
+
# /home/ben/pd.ben/sequenceserver/db/nr => no
|
287
|
+
# /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
|
288
|
+
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
|
289
|
+
# /mnt/blast-db/refseq_genomic.100 => yes
|
290
|
+
def multipart_database_name?(db_name)
|
291
|
+
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
292
|
+
end
|
293
|
+
|
294
|
+
def get_categories(path)
|
295
|
+
path
|
296
|
+
.gsub(config[:database_dir], '') # remove database_dir from path
|
297
|
+
.split('/')
|
298
|
+
.reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
|
299
|
+
end
|
300
|
+
|
301
|
+
# Returns true if the file name ends in a known FASTA-like suffix and the first character of the file is '>'.
|
302
|
+
def probably_fasta?(file)
|
303
|
+
return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)
|
304
|
+
File.read(file, 1) == '>'
|
305
|
+
end
|
306
|
+
|
307
|
+
# Suggests improved titles when generating database names from files
|
308
|
+
# for improved appearance and readability in web interface.
|
309
|
+
# For example:
|
310
|
+
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
311
|
+
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
312
|
+
def make_db_title(path)
|
313
|
+
db_name = File.basename(path)
|
314
|
+
db_name.tr!('"', "'")
|
315
|
+
# removes .fasta like extension names
|
316
|
+
db_name.gsub!(File.extname(db_name), '')
|
317
|
+
# replaces _ with ' ',
|
318
|
+
db_name.gsub!(/(_)/, ' ')
|
319
|
+
# replaces '.' with ' ' when no numbers are on either side,
|
320
|
+
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
321
|
+
# preserves version numbers
|
322
|
+
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
323
|
+
db_name
|
324
|
+
end
|
325
|
+
|
326
|
+
# Guess whether FASTA file contains protein or nucleotide sequences by
|
327
|
+
# sampling a few characters of the file.
|
328
|
+
def guess_sequence_type_in_fasta(file)
|
329
|
+
sequences = sample_sequences(file)
|
330
|
+
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
331
|
+
sequence_types = sequence_types.uniq.compact
|
332
|
+
(sequence_types.length == 1) && sequence_types.first
|
333
|
+
end
|
334
|
+
|
335
|
+
# Read first 1,048,576 characters of the file, split the read text on
|
336
|
+
# fasta def line pattern and return.
|
337
|
+
#
|
338
|
+
# If the given file is FASTA, returns Array of as many different
|
339
|
+
# sequences in the portion of the file read. Returns the portion
|
340
|
+
# of the file read wrapped in an Array otherwise.
|
341
|
+
def sample_sequences(file)
|
342
|
+
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
@@ -8,7 +8,8 @@ module SequenceServer
|
|
8
8
|
# Example usage:
|
9
9
|
#
|
10
10
|
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
11
|
-
# makeblastdb.
|
11
|
+
# makeblastdb.run # formats and re-formats databases in database_dir
|
12
|
+
# makeblastdb.formatted_fastas # lists formatted databases
|
12
13
|
#
|
13
14
|
class MAKEBLASTDB
|
14
15
|
extend Forwardable
|
@@ -20,56 +21,21 @@ module SequenceServer
|
|
20
21
|
end
|
21
22
|
|
22
23
|
attr_reader :database_dir
|
23
|
-
attr_reader :formatted_fastas
|
24
|
-
attr_reader :fastas_to_format
|
25
|
-
attr_reader :fastas_to_reformat
|
26
24
|
|
27
|
-
# Scans the database directory to determine which FASTA files require
|
28
|
-
# formatting or re-formatting.
|
29
|
-
#
|
30
|
-
# Returns `true` if there are files to (re-)format, `false` otherwise.
|
31
|
-
def scan
|
32
|
-
# We need to know the list of formatted FASTAs as reported by blastdbcmd
|
33
|
-
# first. This is required to determine both unformatted FASTAs and those
|
34
|
-
# that require reformatting.
|
35
|
-
@formatted_fastas = []
|
36
|
-
determine_formatted_fastas
|
37
|
-
|
38
|
-
# Now determine FASTA files that are unformatted or require reformatting.
|
39
|
-
@fastas_to_format = []
|
40
|
-
determine_unformatted_fastas
|
41
|
-
@fastas_to_reformat = []
|
42
|
-
determine_fastas_to_reformat
|
43
|
-
|
44
|
-
# Return true if there are files to be (re-)formatted or false otherwise.
|
45
|
-
!@fastas_to_format.empty? || !@fastas_to_reformat.empty?
|
46
|
-
end
|
47
|
-
|
48
|
-
# Returns true if at least one database in database directory is formatted.
|
49
25
|
def any_formatted?
|
50
|
-
|
26
|
+
formatted_fastas.any?
|
51
27
|
end
|
52
28
|
|
53
|
-
|
54
|
-
|
55
|
-
def any_unformatted?
|
56
|
-
!@fastas_to_format.empty?
|
29
|
+
def any_to_format_or_reformat?
|
30
|
+
any_to_format? || any_to_reformat?
|
57
31
|
end
|
58
32
|
|
59
|
-
|
60
|
-
|
61
|
-
#
|
62
|
-
# Note that it is okay to only use V4 databases or only V5 databases.
|
63
|
-
# Incompatibility arises when they are mixed.
|
64
|
-
def any_incompatible?
|
65
|
-
return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
|
66
|
-
return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
|
67
|
-
true
|
33
|
+
def no_fastas?
|
34
|
+
probably_fastas.empty?
|
68
35
|
end
|
69
36
|
|
70
|
-
# Runs makeblastdb on each file in
|
71
|
-
#
|
72
|
-
# has been run before.
|
37
|
+
# Runs makeblastdb on each file in `fastas_to_format` and
|
38
|
+
# `fastas_to_reformat`.
|
73
39
|
def run
|
74
40
|
format
|
75
41
|
reformat
|
@@ -80,8 +46,9 @@ module SequenceServer
|
|
80
46
|
def format
|
81
47
|
# Make the intent clear as well as ensure the program won't crash if we
|
82
48
|
# accidentally call format before calling scan.
|
83
|
-
return unless
|
84
|
-
|
49
|
+
return unless any_to_format?
|
50
|
+
|
51
|
+
fastas_to_format.select do |path, title, type|
|
85
52
|
make_blast_database('format', path, title, type)
|
86
53
|
end
|
87
54
|
end
|
@@ -91,50 +58,85 @@ module SequenceServer
|
|
91
58
|
def reformat
|
92
59
|
# Make the intent clear as well as ensure the program won't crash if
|
93
60
|
# we accidentally call reformat before calling scan.
|
94
|
-
return unless
|
95
|
-
|
61
|
+
return unless any_to_reformat?
|
62
|
+
|
63
|
+
fastas_to_reformat.select do |path, title, type, non_parse_seqids|
|
96
64
|
make_blast_database('reformat', path, title, type, non_parse_seqids)
|
97
65
|
end
|
98
66
|
end
|
99
67
|
|
100
|
-
private
|
101
|
-
|
102
68
|
# Determines which FASTA files in the database directory are already
|
103
|
-
# formatted.
|
104
|
-
def
|
69
|
+
# formatted.
|
70
|
+
def formatted_fastas
|
71
|
+
return @formatted_fastas if defined?(@formatted_fastas)
|
72
|
+
|
73
|
+
@formatted_fastas = []
|
74
|
+
|
105
75
|
blastdbcmd.each_line do |line|
|
106
76
|
path, *rest = line.chomp.split("\t")
|
107
77
|
next if multipart_database_name?(path)
|
78
|
+
|
108
79
|
rest << get_categories(path)
|
109
80
|
@formatted_fastas << Database.new(path, *rest)
|
110
81
|
end
|
82
|
+
|
83
|
+
@formatted_fastas
|
84
|
+
end
|
85
|
+
|
86
|
+
def any_to_format?
|
87
|
+
fastas_to_format.any?
|
88
|
+
end
|
89
|
+
|
90
|
+
private
|
91
|
+
|
92
|
+
def any_to_reformat?
|
93
|
+
fastas_to_reformat.any?
|
111
94
|
end
|
112
95
|
|
113
96
|
# Determines which FASTA files in the database directory require
|
114
|
-
# reformatting.
|
115
|
-
def
|
116
|
-
@
|
117
|
-
|
118
|
-
|
119
|
-
|
97
|
+
# reformatting.
|
98
|
+
def fastas_to_reformat
|
99
|
+
return @fastas_to_reformat if defined?(@fastas_to_reformat)
|
100
|
+
|
101
|
+
@fastas_to_reformat = []
|
102
|
+
formatted_fastas.each do |ff|
|
103
|
+
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?] if ff.v4? || ff.non_parse_seqids?
|
120
104
|
end
|
105
|
+
|
106
|
+
@fastas_to_reformat
|
121
107
|
end
|
122
108
|
|
123
109
|
# Determines which FASTA files in the database directory are
|
124
|
-
# unformatted.
|
125
|
-
def
|
126
|
-
|
127
|
-
|
128
|
-
|
110
|
+
# unformatted.
|
111
|
+
def fastas_to_format
|
112
|
+
return @fastas_to_format if defined?(@fastas_to_format)
|
113
|
+
|
114
|
+
formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
|
115
|
+
fasta_paths_to_format = probably_fastas - formatted_fasta_paths
|
116
|
+
|
117
|
+
@fastas_to_format = fasta_paths_to_format.map do |path|
|
118
|
+
[
|
119
|
+
path,
|
120
|
+
make_db_title(path),
|
121
|
+
guess_sequence_type_in_fasta(path)
|
122
|
+
]
|
123
|
+
end
|
124
|
+
|
125
|
+
@fastas_to_format
|
126
|
+
end
|
127
|
+
|
128
|
+
def probably_fastas
|
129
|
+
return @probably_fastas if defined?(@probably_fastas)
|
130
|
+
|
131
|
+
@probably_fastas = []
|
132
|
+
|
129
133
|
Find.find(database_dir + '/') do |path|
|
130
134
|
next if File.directory?(path)
|
131
|
-
next unless probably_fasta?(path)
|
132
|
-
next if @formatted_fastas.any? { |f| f[0] == path }
|
133
135
|
|
134
|
-
@
|
135
|
-
make_db_title(path),
|
136
|
-
guess_sequence_type_in_fasta(path)]
|
136
|
+
@probably_fastas << path if probably_fasta?(path)
|
137
137
|
end
|
138
|
+
|
139
|
+
@probably_fastas
|
138
140
|
end
|
139
141
|
|
140
142
|
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|
@@ -146,14 +148,16 @@ module SequenceServer
|
|
146
148
|
out, err = sys(cmd, path: config[:bin])
|
147
149
|
errpat = /BLAST Database error/
|
148
150
|
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
|
149
|
-
|
151
|
+
|
152
|
+
out
|
150
153
|
rescue CommandFailed => e
|
151
|
-
|
154
|
+
raise BLAST_DATABASE_ERROR.new(cmd, e.stderr)
|
152
155
|
end
|
153
156
|
|
154
157
|
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
155
158
|
def make_blast_database(action, file, title, type, non_parse_seqids = false)
|
156
159
|
return unless make_blast_database?(action, file, type)
|
160
|
+
|
157
161
|
title = confirm_database_title(title)
|
158
162
|
extract_fasta(file) unless File.exist?(file)
|
159
163
|
taxonomy = taxid_map(file, non_parse_seqids) || taxid
|
@@ -188,9 +192,10 @@ module SequenceServer
|
|
188
192
|
# using blastdbcmd.
|
189
193
|
def taxid_map(db, non_parse_seqids)
|
190
194
|
return if non_parse_seqids
|
195
|
+
|
191
196
|
taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
|
192
|
-
extract_taxid_map(db, taxid_map)
|
193
|
-
"-taxid_map #{taxid_map}"
|
197
|
+
extract_taxid_map(db, taxid_map) unless File.exist?(taxid_map)
|
198
|
+
"-taxid_map #{taxid_map}" unless File.zero?(taxid_map)
|
194
199
|
end
|
195
200
|
|
196
201
|
# Get taxid from the user. Returns user input or 0.
|
@@ -211,10 +216,24 @@ module SequenceServer
|
|
211
216
|
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
|
212
217
|
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
213
218
|
" #{taxonomy}"
|
214
|
-
|
219
|
+
|
220
|
+
output = if File.directory?(file)
|
221
|
+
File.join(file, 'makeblastdb')
|
222
|
+
else
|
223
|
+
"#{file}.makeblastdb"
|
224
|
+
end
|
225
|
+
|
226
|
+
out, err = sys(
|
227
|
+
cmd,
|
228
|
+
path: config[:bin],
|
229
|
+
stderr: [output, 'stderr'].join,
|
230
|
+
stdout: [output, 'stdout'].join
|
231
|
+
)
|
232
|
+
|
215
233
|
puts out.strip
|
216
234
|
puts err.strip
|
217
|
-
|
235
|
+
|
236
|
+
true
|
218
237
|
rescue CommandFailed => e
|
219
238
|
puts <<~MSG
|
220
239
|
Could not create BLAST database for: #{file}
|
@@ -261,7 +280,7 @@ module SequenceServer
|
|
261
280
|
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
|
262
281
|
# /mnt/blast-db/refseq_genomic.100 => yes
|
263
282
|
def multipart_database_name?(db_name)
|
264
|
-
!
|
283
|
+
!db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?
|
265
284
|
end
|
266
285
|
|
267
286
|
def get_categories(path)
|
@@ -273,7 +292,10 @@ module SequenceServer
|
|
273
292
|
|
274
293
|
# Returns true if first character of the file is '>'.
|
275
294
|
def probably_fasta?(file)
|
276
|
-
|
295
|
+
unless file.match(/((cdna)|(cds)|(dna)|(fa)|(faa)|(fas)|(fasta)|(fna)|(genome)|(nt)|(nuc)|(pep)|(prot))$/i)
|
296
|
+
return false
|
297
|
+
end
|
298
|
+
|
277
299
|
File.read(file, 1) == '>'
|
278
300
|
end
|
279
301
|
|
data/lib/sequenceserver/pool.rb
CHANGED
@@ -7,11 +7,8 @@ module SequenceServer
|
|
7
7
|
# own report subclass.
|
8
8
|
class Report
|
9
9
|
class << self
|
10
|
-
# Generates report for the given job. Returns generated report object.
|
11
|
-
#
|
12
|
-
# TODO: Dynamic dispatch.
|
13
10
|
def generate(job)
|
14
|
-
BLAST::Report.new(job)
|
11
|
+
BLAST::Report.new(job).generate
|
15
12
|
end
|
16
13
|
end
|
17
14
|
|
@@ -23,7 +20,6 @@ module SequenceServer
|
|
23
20
|
def initialize(job)
|
24
21
|
@job = job
|
25
22
|
yield if block_given?
|
26
|
-
generate
|
27
23
|
end
|
28
24
|
|
29
25
|
attr_reader :job
|