sequenceserver 2.0.0.rc4 → 2.0.0.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/sequenceserver +10 -27
- data/lib/sequenceserver.rb +6 -0
- data/lib/sequenceserver/blast/job.rb +11 -1
- data/lib/sequenceserver/database.rb +0 -134
- data/lib/sequenceserver/makeblastdb.rb +243 -0
- data/lib/sequenceserver/version.rb +1 -1
- data/public/js/hit.js +1 -1
- data/public/js/search.js +4 -6
- data/public/sequenceserver-report.min.js +1 -1
- data/public/sequenceserver-search.min.js +1 -1
- data/spec/capybara_spec.rb +11 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ndb +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nos +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.not +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ntf +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nto +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pdb +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pos +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pot +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.ptf +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pto +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pdb +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pos +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pot +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.ptf +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pto +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ndb +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nos +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.not +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ntf +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nto +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhd +8 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsq +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.txt +8 -0
- data/spec/database/v4/links.rb +23 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phd +1189 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phd +9140 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/URL +1 -0
- data/spec/database/v4/si_uniprot_idmap.yml +14180 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhd +473 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database_spec.rb +0 -76
- data/spec/makeblastdb_spec.rb +121 -0
- data/views/layout.erb +4 -0
- metadata +66 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b04c2f43509bf7d4a15d555e3bd9cea86d3c7770c7a3e35016f963f877055ecf
|
4
|
+
data.tar.gz: 25678397eacb2c3f445902624baff80a90d2a47ca98d2b703e13de3eca84cf1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8e6a4c1b4917d9bd35d07ef8e580e1b75105bfd1709fb892d9ed523da91f8c3a08d50c1c9da304530cd0bb5d77da00a4944c65cd5ce90fc979c9d6099149058
|
7
|
+
data.tar.gz: ef493ec4cf1aef63b2f535251143b34154d0d332a47a1d15668c7a74b139d5e6c2a2f9c33b5c7072ae9c2fbbcf33048fad4e828273c85a199a6ab79874d6e766
|
data/bin/sequenceserver
CHANGED
@@ -68,10 +68,6 @@ begin
|
|
68
68
|
# of threads to use in config file.
|
69
69
|
$ sequenceserver -s -n 16
|
70
70
|
|
71
|
-
# See if you have FASTA files in database dir that haven't
|
72
|
-
# been converted into BLAST database.
|
73
|
-
$ sequenceserver -u
|
74
|
-
|
75
71
|
# Search for FASTA files in database dir that haven't been
|
76
72
|
# converted into BLAST database yet, and convert them.
|
77
73
|
$ sequenceserver -m
|
@@ -135,9 +131,6 @@ begin
|
|
135
131
|
on 'l', 'list_databases',
|
136
132
|
'List BLAST databases'
|
137
133
|
|
138
|
-
on 'u', 'list-unformatted-fastas',
|
139
|
-
'List unformatted FASTA files'
|
140
|
-
|
141
134
|
on 'i', 'interactive',
|
142
135
|
'Run SequenceServer in interactive mode'
|
143
136
|
|
@@ -285,8 +278,7 @@ begin
|
|
285
278
|
fetch_option(:database_dir).value = response
|
286
279
|
redo
|
287
280
|
rescue SequenceServer::NO_BLAST_DATABASE_FOUND => e
|
288
|
-
unless list_databases? ||
|
289
|
-
make_blast_databases?
|
281
|
+
unless list_databases? || make_blast_databases?
|
290
282
|
|
291
283
|
# Print error raised.
|
292
284
|
puts
|
@@ -305,13 +297,13 @@ begin
|
|
305
297
|
unless response =~ /^[n]$/i
|
306
298
|
puts
|
307
299
|
puts 'Searching ...'
|
308
|
-
if SequenceServer
|
309
|
-
|
310
|
-
exit!
|
311
|
-
else
|
312
|
-
formatted = SequenceServer::Database.make_blast_databases
|
300
|
+
if SequenceServer.makeblastdb.scan
|
301
|
+
formatted = SequenceServer.makeblastdb.run
|
313
302
|
exit! if formatted.empty? && !set?
|
314
303
|
redo unless set?
|
304
|
+
else
|
305
|
+
puts "Couldn't find any FASTA files."
|
306
|
+
exit!
|
315
307
|
end
|
316
308
|
else
|
317
309
|
exit! unless set?
|
@@ -361,22 +353,13 @@ begin
|
|
361
353
|
exit
|
362
354
|
end
|
363
355
|
|
364
|
-
if
|
365
|
-
|
366
|
-
|
356
|
+
if make_blast_databases?
|
357
|
+
if SequenceServer.makeblastdb.scan
|
358
|
+
SequenceServer.makeblastdb.run
|
359
|
+
else
|
367
360
|
puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
|
368
361
|
'are formatted.'
|
369
|
-
exit
|
370
362
|
end
|
371
|
-
end
|
372
|
-
|
373
|
-
if list_unformatted_fastas?
|
374
|
-
puts unformatted_fastas
|
375
|
-
exit
|
376
|
-
end
|
377
|
-
|
378
|
-
if make_blast_databases?
|
379
|
-
SequenceServer::Database.make_blast_databases
|
380
363
|
exit
|
381
364
|
end
|
382
365
|
|
data/lib/sequenceserver.rb
CHANGED
@@ -24,6 +24,7 @@ module SequenceServer
|
|
24
24
|
require 'sequenceserver/config'
|
25
25
|
require 'sequenceserver/server'
|
26
26
|
require 'sequenceserver/routes'
|
27
|
+
require 'sequenceserver/makeblastdb'
|
27
28
|
require 'sequenceserver/job_remover'
|
28
29
|
require 'sequenceserver/exceptions'
|
29
30
|
require 'sequenceserver/sys'
|
@@ -57,6 +58,11 @@ module SequenceServer
|
|
57
58
|
end
|
58
59
|
end
|
59
60
|
|
61
|
+
# MAKEBLASTDB service object.
|
62
|
+
def makeblastdb
|
63
|
+
@makeblastdb ||= MAKEBLASTDB.new(config[:database_dir])
|
64
|
+
end
|
65
|
+
|
60
66
|
# SequenceServer initialisation routine.
|
61
67
|
def init(config = {})
|
62
68
|
# Use default config file if caller didn't specify one.
|
@@ -65,6 +65,16 @@ module SequenceServer
|
|
65
65
|
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
66
66
|
error = File.read(stderr) if error.empty?
|
67
67
|
fail InputError, error
|
68
|
+
when 2
|
69
|
+
fail InputError, <<~MSG
|
70
|
+
BLAST signalled a problem with the databases that you searched.
|
71
|
+
|
72
|
+
Most likely one or more of your databases were created using an
|
73
|
+
older version of BLAST. Please consider recreating the databases
|
74
|
+
using BLAST #{BLAST_VERSION}.
|
75
|
+
|
76
|
+
As a temporary solution, you can try searching one database at a time.
|
77
|
+
MSG
|
68
78
|
when 4
|
69
79
|
# Out of memory. User can retry with a shorter search, so raising
|
70
80
|
# InputError here instead of SystemError.
|
@@ -79,7 +89,7 @@ module SequenceServer
|
|
79
89
|
# the job. This is a SystemError.
|
80
90
|
fail SystemError, 'Ran out of disk space.'
|
81
91
|
else
|
82
|
-
# I am not sure what the exit codes
|
92
|
+
# I am not sure what exit code 3 means and we should not
|
83
93
|
# encounter exit code 5. The only other error that I know can happen
|
84
94
|
# but is not yet handled is when BLAST+ binaries break such as after
|
85
95
|
# macOS updates. So raise SystemError, include the exit status in the
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'find'
|
2
1
|
require 'open3'
|
3
2
|
require 'digest/md5'
|
4
3
|
require 'forwardable'
|
@@ -209,97 +208,6 @@ module SequenceServer
|
|
209
208
|
end
|
210
209
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
211
210
|
|
212
|
-
# Recursively scan `database_dir` for un-formatted FASTA and format them
|
213
|
-
# for use with BLAST+.
|
214
|
-
def make_blast_databases
|
215
|
-
unformatted_fastas.select do |file, sequence_type|
|
216
|
-
make_blast_database(file, sequence_type)
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
# Returns an Array of FASTA files that may require formatting, and the
|
221
|
-
# type of sequence contained in each FASTA.
|
222
|
-
#
|
223
|
-
# > unformatted_fastas
|
224
|
-
# => [['/foo/bar.fasta', :nulceotide], ...]
|
225
|
-
def unformatted_fastas
|
226
|
-
list = []
|
227
|
-
database_dir = config[:database_dir]
|
228
|
-
Find.find database_dir do |file|
|
229
|
-
next if File.directory? file
|
230
|
-
next if Database.include? file
|
231
|
-
next unless probably_fasta? file
|
232
|
-
sequence_type = guess_sequence_type_in_fasta file
|
233
|
-
if %i[protein nucleotide].include?(sequence_type)
|
234
|
-
list << [file, sequence_type]
|
235
|
-
end
|
236
|
-
end
|
237
|
-
list
|
238
|
-
end
|
239
|
-
|
240
|
-
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
241
|
-
def make_blast_database(file, type)
|
242
|
-
return unless make_blast_database? file, type
|
243
|
-
title = get_database_title(file)
|
244
|
-
taxid = fetch_tax_id
|
245
|
-
_make_blast_database(file, type, title, taxid)
|
246
|
-
end
|
247
|
-
|
248
|
-
def _make_blast_database(file, type, title, taxid, quiet = false)
|
249
|
-
cmd = 'makeblastdb -parse_seqids -hash_index ' \
|
250
|
-
"-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
251
|
-
" -taxid #{taxid}"
|
252
|
-
out, err = sys(cmd, path: config[:bin])
|
253
|
-
puts out, err unless quiet
|
254
|
-
rescue CommandFailed => e
|
255
|
-
puts <<~MSG
|
256
|
-
Could not create BLAST database for: #{file}
|
257
|
-
Tried: #{cmd}
|
258
|
-
stdout: #{e.stdout}
|
259
|
-
stderr: #{e.stderr}
|
260
|
-
MSG
|
261
|
-
exit!
|
262
|
-
end
|
263
|
-
|
264
|
-
# Show file path and guessed sequence type to the user and obtain a y/n
|
265
|
-
# response.
|
266
|
-
#
|
267
|
-
# Returns true if the user entered anything but 'n' or 'N'.
|
268
|
-
def make_blast_database?(file, type)
|
269
|
-
puts
|
270
|
-
puts
|
271
|
-
puts "FASTA file: #{file}"
|
272
|
-
puts "FASTA type: #{type}"
|
273
|
-
print 'Proceed? [y/n] (Default: y): '
|
274
|
-
response = STDIN.gets.to_s.strip
|
275
|
-
!response.match(/n/i)
|
276
|
-
end
|
277
|
-
|
278
|
-
# Generate a title for the given database and show it to the user for
|
279
|
-
# confirmation.
|
280
|
-
#
|
281
|
-
# Returns user input if any. Auto-generated title otherwise.
|
282
|
-
def get_database_title(path)
|
283
|
-
default = make_db_title(File.basename(path))
|
284
|
-
print "Enter a database title or will use '#{default}': "
|
285
|
-
from_user = STDIN.gets.to_s.strip
|
286
|
-
from_user.empty? && default || from_user
|
287
|
-
end
|
288
|
-
|
289
|
-
# Get taxid from the user. Returns user input or 0.
|
290
|
-
#
|
291
|
-
# Using 0 as taxid is equivalent to not setting taxid for the database
|
292
|
-
# that will be created.
|
293
|
-
def fetch_tax_id
|
294
|
-
default = 0
|
295
|
-
print 'Enter taxid (optional): '
|
296
|
-
user_response = STDIN.gets.strip
|
297
|
-
user_response.empty? && default || Integer(user_response)
|
298
|
-
rescue
|
299
|
-
puts 'taxid should be a number'
|
300
|
-
retry
|
301
|
-
end
|
302
|
-
|
303
211
|
# Returns true if the database name appears to be a multi-part database
|
304
212
|
# name.
|
305
213
|
#
|
@@ -312,48 +220,6 @@ module SequenceServer
|
|
312
220
|
def multipart_database_name?(db_name)
|
313
221
|
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
314
222
|
end
|
315
|
-
|
316
|
-
# Returns true if first character of the file is '>'.
|
317
|
-
def probably_fasta?(file)
|
318
|
-
File.read(file, 1) == '>'
|
319
|
-
end
|
320
|
-
|
321
|
-
# Suggests improved titles when generating database names from files
|
322
|
-
# for improved apperance and readability in web interface.
|
323
|
-
# For example:
|
324
|
-
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
325
|
-
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
326
|
-
def make_db_title(db_name)
|
327
|
-
db_name.tr!('"', "'")
|
328
|
-
# removes .fasta like extension names
|
329
|
-
db_name.gsub!(File.extname(db_name), '')
|
330
|
-
# replaces _ with ' ',
|
331
|
-
db_name.gsub!(/(_)/, ' ')
|
332
|
-
# replaces '.' with ' ' when no numbers are on either side,
|
333
|
-
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
334
|
-
# preserves version numbers
|
335
|
-
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
336
|
-
db_name
|
337
|
-
end
|
338
|
-
|
339
|
-
# Guess whether FASTA file contains protein or nucleotide sequences by
|
340
|
-
# sampling a few few characters of the file.
|
341
|
-
def guess_sequence_type_in_fasta(file)
|
342
|
-
sequences = sample_sequences(file)
|
343
|
-
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
344
|
-
sequence_types = sequence_types.uniq.compact
|
345
|
-
(sequence_types.length == 1) && sequence_types.first
|
346
|
-
end
|
347
|
-
|
348
|
-
# Read first 1,048,576 characters of the file, split the read text on
|
349
|
-
# fasta def line pattern and return.
|
350
|
-
#
|
351
|
-
# If the given file is FASTA, returns Array of as many different
|
352
|
-
# sequences in the portion of the file read. Returns the portion
|
353
|
-
# of the file read wrapped in an Array otherwise.
|
354
|
-
def sample_sequences(file)
|
355
|
-
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
356
|
-
end
|
357
223
|
end
|
358
224
|
end
|
359
225
|
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
require 'find'
require 'forwardable'
require 'shellwords'
|
3
|
+
|
4
|
+
module SequenceServer
  # Smart `makeblastdb` wrapper: recursively scans the database directory,
  # determining which files need to be formatted or re-formatted.
  #
  # Example usage:
  #
  #   makeblastdb = MAKEBLASTDB.new(database_dir)
  #   makeblastdb.scan && makeblastdb.run
  #
  class MAKEBLASTDB
    # We want V5 databases created using -parse_seqids for proper function of
    # SequenceServer. This means each database should be comprised of at least 9
    # files with the following extensions. Databases created by us will have two
    # additional files with the extensions nhd and nhi, or phd and phi, due to
    # the use of -hash_index option. Finally, multipart databases will have one
    # additional file with the extension nal or pal.
    REQUIRED_EXTENSIONS = {
      'nucleotide' => %w[ndb nhr nin nog nos not nsq ntf nto].freeze,
      'protein' => %w[pdb phr pin pog pos pot psq ptf pto].freeze
    }.freeze

    # Sequence types, as guessed from a FASTA file, that we know how to format.
    FORMATTABLE_TYPES = %i[protein nucleotide].freeze

    extend Forwardable

    def_delegators SequenceServer, :config, :sys

    def initialize(database_dir)
      @database_dir = database_dir
    end

    attr_reader :database_dir

    # Scans the database directory to determine which FASTA files require
    # formatting or re-formatting.
    #
    # Returns `true` if there are files to (re-)format, `false` otherwise.
    def scan
      # We need to know the list of formatted FASTAs as reported by blastdbcmd
      # first. This is required to determine both unformatted FASTAs and those
      # that require reformatting.
      @formatted_fastas = []
      determine_formatted_fastas

      # Now determine FASTA files that are unformatted or require reformatting.
      @fastas_to_format = []
      determine_unformatted_fastas
      determine_fastas_to_reformat

      # Return true if there are files to be (re-)formatted or false otherwise.
      !@fastas_to_format.empty?
    end

    # Runs makeblastdb on each file in `@fastas_to_format`. Does nothing (and
    # returns an empty Array) unless `#scan` has been run before and found
    # something to format.
    #
    # Returns the Array of `[path, title, type]` entries that were iterated.
    def run
      # `@fastas_to_format` is nil until `#scan` is called; treat that the same
      # as "nothing to do" instead of failing on `nil.empty?`.
      fastas = @fastas_to_format || []
      fastas.each do |path, title, type|
        make_blast_database(path, title, type)
      end
    end

    private

    # Determines which FASTA files in the database directory are already
    # formatted. Adds `[path, title, type]` entries to @formatted_fastas.
    def determine_formatted_fastas
      blastdbcmd.to_s.each_line do |line|
        # Fields are tab separated (see #blastdbcmd) because database titles
        # routinely contain spaces.
        path, title, type = line.chomp.split("\t")
        # Skip blank or otherwise malformed lines.
        next unless path && type
        next if multipart_database_name?(path)

        @formatted_fastas << [path, title, type.strip.downcase]
      end
    end

    # Determines which FASTA files in the database directory require
    # reformatting, i.e. those for which one or more of the files listed in
    # REQUIRED_EXTENSIONS is missing. Adds to @fastas_to_format.
    def determine_fastas_to_reformat
      @formatted_fastas.each do |path, title, type|
        required_extensions = REQUIRED_EXTENSIONS[type]
        # A type we have no extension list for cannot be checked; skip it
        # rather than fail.
        next unless required_extensions

        exts = Dir["#{path}.*"].map { |file| file.split('.').last }
        next if (required_extensions - exts).empty?

        @fastas_to_format << [path, title, type]
      end
    end

    # Determines which FASTA files in the database directory are
    # unformatted. Adds to @fastas_to_format.
    #
    # Files whose sequence type could not be determined to be exclusively
    # protein or nucleotide are skipped, since makeblastdb needs a valid
    # -dbtype.
    def determine_unformatted_fastas
      formatted_paths = @formatted_fastas.map(&:first)
      Find.find(database_dir) do |path|
        next if File.directory?(path)
        next unless probably_fasta?(path)
        next if formatted_paths.include?(path)

        type = guess_sequence_type_in_fasta(path)
        next unless FORMATTABLE_TYPES.include?(type)

        @fastas_to_format << [path, make_db_title(File.basename(path)), type]
      end
    end

    # Runs `blastdbcmd` to determine formatted FASTA files in the database
    # directory. Returns the output of `blastdbcmd`: one line per database
    # with path, title and type separated by tabs. This method is called by
    # `determine_formatted_fastas`.
    def blastdbcmd
      cmd = "blastdbcmd -recursive -list #{Shellwords.escape(database_dir)}" \
            " -list_outfmt \"%f\t%t\t%p\""
      out, = sys(cmd, path: config[:bin])
      out
    end

    # Create BLAST database, given FASTA file, database title and sequence
    # type in FASTA file. The user is asked to confirm, and may override the
    # title and supply a taxid.
    def make_blast_database(file, title, type)
      return unless make_blast_database? file, type
      title = confirm_database_title(title)
      taxid = fetch_tax_id
      _make_blast_database(file, type, title, taxid)
    end

    # Runs makeblastdb for the given file. If the FASTA file itself is missing
    # (we are reformatting an existing database) it is first extracted from
    # the database. Prints diagnostics and exits if makeblastdb fails.
    def _make_blast_database(file, type, title, taxid)
      extract_fasta(file) unless File.exist?(file)
      # File path and title are shell-escaped so that quotes and other shell
      # metacharacters in them cannot break the command line.
      cmd = "makeblastdb -parse_seqids -hash_index -in #{Shellwords.escape(file)} " \
            "-dbtype #{type.to_s.slice(0, 4)} -title #{Shellwords.escape(title)}" \
            " -taxid #{taxid}"
      out, err = sys(cmd, path: config[:bin])
      puts out.strip
      puts err.strip
    rescue CommandFailed => e
      puts <<~MSG
        Could not create BLAST database for: #{file}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end

    # Show file path and guessed sequence type to the user and obtain a y/n
    # response.
    #
    # Returns false if the response contains 'n' or 'N' (e.g. 'n', 'no'),
    # true otherwise (including an empty response or end of input).
    def make_blast_database?(file, type)
      puts
      puts
      puts "FASTA file to format/reformat: #{file}"
      puts "FASTA type: #{type}"
      print 'Proceed? [y/n] (Default: y): '
      response = $stdin.gets.to_s.strip
      !response.match(/n/i)
    end

    # Show the database title that we are going to use to the user for
    # confirmation.
    #
    # Returns user input if any. Auto-determined title otherwise.
    def confirm_database_title(default)
      print "Enter a database title or will use '#{default}': "
      from_user = $stdin.gets.to_s.strip
      from_user.empty? ? default : from_user
    end

    # Get taxid from the user. Returns user input as an Integer, or 0 if the
    # user entered nothing (or input has ended). Asks again if the input is
    # not a number.
    #
    # Using 0 as taxid is equivalent to not setting taxid for the database
    # that will be created.
    def fetch_tax_id
      print 'Enter taxid (optional): '
      # `gets` returns nil at end of input; `to_s` makes that equivalent to an
      # empty response so that we do not retry forever.
      user_response = $stdin.gets.to_s.strip
      user_response.empty? ? 0 : Integer(user_response)
    rescue ArgumentError
      puts 'taxid should be a number'
      retry
    end

    # Extract FASTA file from BLAST database.
    #
    # Invoked while reformatting a BLAST database if the corresponding
    # FASTA file does not exist. Prints diagnostics and exits if blastdbcmd
    # fails.
    def extract_fasta(db)
      puts
      puts 'Extracting sequences ...'
      cmd = "blastdbcmd -entry all -db #{Shellwords.escape(db)}"
      sys(cmd, stdout: db, path: config[:bin])
    rescue CommandFailed => e
      puts <<~MSG
        Could not extract sequences from: #{db}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end

    # Returns true if the database name appears to be a multi-part database
    # name.
    def multipart_database_name?(db_name)
      Database.multipart_database_name? db_name
    end

    # Returns true if first character of the file is '>'.
    def probably_fasta?(file)
      File.read(file, 1) == '>'
    end

    # Suggests improved titles when generating database names from files
    # for improved appearance and readability in web interface.
    # For example:
    # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
    # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
    #
    # Returns a new String; the argument is left untouched.
    def make_db_title(db_name)
      title = db_name.tr('"', "'")
      # removes .fasta like extension names (only the trailing one)
      title = title.sub(/#{Regexp.escape(File.extname(title))}\z/, '')
      # replaces _ with ' ',
      title = title.tr('_', ' ')
      # replaces '.' with ' ' when no numbers are on either side,
      title = title.gsub(/(\D)\.(?=\D)/, '\1 ')
      # preserves version numbers
      title.gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
    end

    # Guess whether FASTA file contains protein or nucleotide sequences by
    # sampling the first few sequences of the file.
    #
    # Returns the guessed type if all sampled sequences agree on a single
    # type, `false` otherwise.
    def guess_sequence_type_in_fasta(file)
      sequences = sample_sequences(file)
      sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
      sequence_types = sequence_types.uniq.compact
      (sequence_types.length == 1) && sequence_types.first
    end

    # Read first 1,048,576 characters of the file, split the read text on
    # fasta def line pattern and return.
    #
    # If the given file is FASTA, returns Array of as many different
    # sequences in the portion of the file read. Returns the portion
    # of the file read wrapped in an Array otherwise. Returns an empty Array
    # for an empty file.
    def sample_sequences(file)
      File.read(file, 1_048_576).to_s.split(/^>.+$/).reject(&:empty?)
    end
  end
end
|