sequenceserver 2.0.0.rc4 → 2.0.0.rc5
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sequenceserver might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/bin/sequenceserver +10 -27
- data/lib/sequenceserver.rb +6 -0
- data/lib/sequenceserver/blast/job.rb +11 -1
- data/lib/sequenceserver/database.rb +0 -134
- data/lib/sequenceserver/makeblastdb.rb +243 -0
- data/lib/sequenceserver/version.rb +1 -1
- data/public/js/hit.js +1 -1
- data/public/js/search.js +4 -6
- data/public/sequenceserver-report.min.js +1 -1
- data/public/sequenceserver-search.min.js +1 -1
- data/spec/capybara_spec.rb +11 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ndb +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nos +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.not +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ntf +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nto +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pdb +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pos +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pot +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.ptf +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pto +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pdb +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pos +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pot +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.ptf +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pto +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ndb +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nos +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.not +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ntf +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nto +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhd +8 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsq +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.txt +8 -0
- data/spec/database/v4/links.rb +23 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phd +1189 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phd +9140 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/URL +1 -0
- data/spec/database/v4/si_uniprot_idmap.yml +14180 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhd +473 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database_spec.rb +0 -76
- data/spec/makeblastdb_spec.rb +121 -0
- data/views/layout.erb +4 -0
- metadata +66 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b04c2f43509bf7d4a15d555e3bd9cea86d3c7770c7a3e35016f963f877055ecf
|
4
|
+
data.tar.gz: 25678397eacb2c3f445902624baff80a90d2a47ca98d2b703e13de3eca84cf1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8e6a4c1b4917d9bd35d07ef8e580e1b75105bfd1709fb892d9ed523da91f8c3a08d50c1c9da304530cd0bb5d77da00a4944c65cd5ce90fc979c9d6099149058
|
7
|
+
data.tar.gz: ef493ec4cf1aef63b2f535251143b34154d0d332a47a1d15668c7a74b139d5e6c2a2f9c33b5c7072ae9c2fbbcf33048fad4e828273c85a199a6ab79874d6e766
|
data/bin/sequenceserver
CHANGED
@@ -68,10 +68,6 @@ begin
|
|
68
68
|
# of threads to use in config file.
|
69
69
|
$ sequenceserver -s -n 16
|
70
70
|
|
71
|
-
# See if you have FASTA files in database dir that haven't
|
72
|
-
# been converted into BLAST database.
|
73
|
-
$ sequenceserver -u
|
74
|
-
|
75
71
|
# Search for FASTA files in database dir that haven't been
|
76
72
|
# converted into BLAST database yet, and convert them.
|
77
73
|
$ sequenceserver -m
|
@@ -135,9 +131,6 @@ begin
|
|
135
131
|
on 'l', 'list_databases',
|
136
132
|
'List BLAST databases'
|
137
133
|
|
138
|
-
on 'u', 'list-unformatted-fastas',
|
139
|
-
'List unformatted FASTA files'
|
140
|
-
|
141
134
|
on 'i', 'interactive',
|
142
135
|
'Run SequenceServer in interactive mode'
|
143
136
|
|
@@ -285,8 +278,7 @@ begin
|
|
285
278
|
fetch_option(:database_dir).value = response
|
286
279
|
redo
|
287
280
|
rescue SequenceServer::NO_BLAST_DATABASE_FOUND => e
|
288
|
-
unless list_databases? ||
|
289
|
-
make_blast_databases?
|
281
|
+
unless list_databases? || make_blast_databases?
|
290
282
|
|
291
283
|
# Print error raised.
|
292
284
|
puts
|
@@ -305,13 +297,13 @@ begin
|
|
305
297
|
unless response =~ /^[n]$/i
|
306
298
|
puts
|
307
299
|
puts 'Searching ...'
|
308
|
-
if SequenceServer
|
309
|
-
|
310
|
-
exit!
|
311
|
-
else
|
312
|
-
formatted = SequenceServer::Database.make_blast_databases
|
300
|
+
if SequenceServer.makeblastdb.scan
|
301
|
+
formatted = SequenceServer.makeblastdb.run
|
313
302
|
exit! if formatted.empty? && !set?
|
314
303
|
redo unless set?
|
304
|
+
else
|
305
|
+
puts "Couldn't find any FASTA files."
|
306
|
+
exit!
|
315
307
|
end
|
316
308
|
else
|
317
309
|
exit! unless set?
|
@@ -361,22 +353,13 @@ begin
|
|
361
353
|
exit
|
362
354
|
end
|
363
355
|
|
364
|
-
if
|
365
|
-
|
366
|
-
|
356
|
+
if make_blast_databases?
|
357
|
+
if SequenceServer.makeblastdb.scan
|
358
|
+
SequenceServer.makeblastdb.run
|
359
|
+
else
|
367
360
|
puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
|
368
361
|
'are formatted.'
|
369
|
-
exit
|
370
362
|
end
|
371
|
-
end
|
372
|
-
|
373
|
-
if list_unformatted_fastas?
|
374
|
-
puts unformatted_fastas
|
375
|
-
exit
|
376
|
-
end
|
377
|
-
|
378
|
-
if make_blast_databases?
|
379
|
-
SequenceServer::Database.make_blast_databases
|
380
363
|
exit
|
381
364
|
end
|
382
365
|
|
data/lib/sequenceserver.rb
CHANGED
@@ -24,6 +24,7 @@ module SequenceServer
|
|
24
24
|
require 'sequenceserver/config'
|
25
25
|
require 'sequenceserver/server'
|
26
26
|
require 'sequenceserver/routes'
|
27
|
+
require 'sequenceserver/makeblastdb'
|
27
28
|
require 'sequenceserver/job_remover'
|
28
29
|
require 'sequenceserver/exceptions'
|
29
30
|
require 'sequenceserver/sys'
|
@@ -57,6 +58,11 @@ module SequenceServer
|
|
57
58
|
end
|
58
59
|
end
|
59
60
|
|
61
|
+
# MAKEBLASTDB service object.
|
62
|
+
def makeblastdb
|
63
|
+
@makeblastdb ||= MAKEBLASTDB.new(config[:database_dir])
|
64
|
+
end
|
65
|
+
|
60
66
|
# SequenceServer initialisation routine.
|
61
67
|
def init(config = {})
|
62
68
|
# Use default config file if caller didn't specify one.
|
@@ -65,6 +65,16 @@ module SequenceServer
|
|
65
65
|
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
66
66
|
error = File.read(stderr) if error.empty?
|
67
67
|
fail InputError, error
|
68
|
+
when 2
|
69
|
+
fail InputError, <<~MSG
|
70
|
+
BLAST signalled a problem with the databases that you searched.
|
71
|
+
|
72
|
+
Most likely one or more of your databases were created using an
|
73
|
+
older version of BLAST. Please consider recreating the databases
|
74
|
+
using BLAST #{BLAST_VERSION}.
|
75
|
+
|
76
|
+
As a temporary solution, you can try searching one database at a time.
|
77
|
+
MSG
|
68
78
|
when 4
|
69
79
|
# Out of memory. User can retry with a shorter search, so raising
|
70
80
|
# InputError here instead of SystemError.
|
@@ -79,7 +89,7 @@ module SequenceServer
|
|
79
89
|
# the job. This is a SystemError.
|
80
90
|
fail SystemError, 'Ran out of disk space.'
|
81
91
|
else
|
82
|
-
# I am not sure what the exit codes
|
92
|
+
# I am not sure what exit code 3 means and we should not
|
83
93
|
# encounter exit code 5. The only other error that I know can happen
|
84
94
|
# but is not yet handled is when BLAST+ binaries break such as after
|
85
95
|
# macOS updates. So raise SystemError, include the exit status in the
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'find'
|
2
1
|
require 'open3'
|
3
2
|
require 'digest/md5'
|
4
3
|
require 'forwardable'
|
@@ -209,97 +208,6 @@ module SequenceServer
|
|
209
208
|
end
|
210
209
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
211
210
|
|
212
|
-
# Recursively scan `database_dir` for un-formatted FASTA and format them
|
213
|
-
# for use with BLAST+.
|
214
|
-
def make_blast_databases
|
215
|
-
unformatted_fastas.select do |file, sequence_type|
|
216
|
-
make_blast_database(file, sequence_type)
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
# Returns an Array of FASTA files that may require formatting, and the
|
221
|
-
# type of sequence contained in each FASTA.
|
222
|
-
#
|
223
|
-
# > unformatted_fastas
|
224
|
-
# => [['/foo/bar.fasta', :nucleotide], ...]
|
225
|
-
def unformatted_fastas
|
226
|
-
list = []
|
227
|
-
database_dir = config[:database_dir]
|
228
|
-
Find.find database_dir do |file|
|
229
|
-
next if File.directory? file
|
230
|
-
next if Database.include? file
|
231
|
-
next unless probably_fasta? file
|
232
|
-
sequence_type = guess_sequence_type_in_fasta file
|
233
|
-
if %i[protein nucleotide].include?(sequence_type)
|
234
|
-
list << [file, sequence_type]
|
235
|
-
end
|
236
|
-
end
|
237
|
-
list
|
238
|
-
end
|
239
|
-
|
240
|
-
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
241
|
-
def make_blast_database(file, type)
|
242
|
-
return unless make_blast_database? file, type
|
243
|
-
title = get_database_title(file)
|
244
|
-
taxid = fetch_tax_id
|
245
|
-
_make_blast_database(file, type, title, taxid)
|
246
|
-
end
|
247
|
-
|
248
|
-
def _make_blast_database(file, type, title, taxid, quiet = false)
|
249
|
-
cmd = 'makeblastdb -parse_seqids -hash_index ' \
|
250
|
-
"-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
251
|
-
" -taxid #{taxid}"
|
252
|
-
out, err = sys(cmd, path: config[:bin])
|
253
|
-
puts out, err unless quiet
|
254
|
-
rescue CommandFailed => e
|
255
|
-
puts <<~MSG
|
256
|
-
Could not create BLAST database for: #{file}
|
257
|
-
Tried: #{cmd}
|
258
|
-
stdout: #{e.stdout}
|
259
|
-
stderr: #{e.stderr}
|
260
|
-
MSG
|
261
|
-
exit!
|
262
|
-
end
|
263
|
-
|
264
|
-
# Show file path and guessed sequence type to the user and obtain a y/n
|
265
|
-
# response.
|
266
|
-
#
|
267
|
-
# Returns true if the user entered anything but 'n' or 'N'.
|
268
|
-
def make_blast_database?(file, type)
|
269
|
-
puts
|
270
|
-
puts
|
271
|
-
puts "FASTA file: #{file}"
|
272
|
-
puts "FASTA type: #{type}"
|
273
|
-
print 'Proceed? [y/n] (Default: y): '
|
274
|
-
response = STDIN.gets.to_s.strip
|
275
|
-
!response.match(/n/i)
|
276
|
-
end
|
277
|
-
|
278
|
-
# Generate a title for the given database and show it to the user for
|
279
|
-
# confirmation.
|
280
|
-
#
|
281
|
-
# Returns user input if any. Auto-generated title otherwise.
|
282
|
-
def get_database_title(path)
|
283
|
-
default = make_db_title(File.basename(path))
|
284
|
-
print "Enter a database title or will use '#{default}': "
|
285
|
-
from_user = STDIN.gets.to_s.strip
|
286
|
-
from_user.empty? && default || from_user
|
287
|
-
end
|
288
|
-
|
289
|
-
# Get taxid from the user. Returns user input or 0.
|
290
|
-
#
|
291
|
-
# Using 0 as taxid is equivalent to not setting taxid for the database
|
292
|
-
# that will be created.
|
293
|
-
def fetch_tax_id
|
294
|
-
default = 0
|
295
|
-
print 'Enter taxid (optional): '
|
296
|
-
user_response = STDIN.gets.strip
|
297
|
-
user_response.empty? && default || Integer(user_response)
|
298
|
-
rescue
|
299
|
-
puts 'taxid should be a number'
|
300
|
-
retry
|
301
|
-
end
|
302
|
-
|
303
211
|
# Returns true if the database name appears to be a multi-part database
|
304
212
|
# name.
|
305
213
|
#
|
@@ -312,48 +220,6 @@ module SequenceServer
|
|
312
220
|
def multipart_database_name?(db_name)
|
313
221
|
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
314
222
|
end
|
315
|
-
|
316
|
-
# Returns true if first character of the file is '>'.
|
317
|
-
def probably_fasta?(file)
|
318
|
-
File.read(file, 1) == '>'
|
319
|
-
end
|
320
|
-
|
321
|
-
# Suggests improved titles when generating database names from files
|
322
|
-
# for improved appearance and readability in web interface.
|
323
|
-
# For example:
|
324
|
-
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
325
|
-
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
326
|
-
def make_db_title(db_name)
|
327
|
-
db_name.tr!('"', "'")
|
328
|
-
# removes .fasta like extension names
|
329
|
-
db_name.gsub!(File.extname(db_name), '')
|
330
|
-
# replaces _ with ' ',
|
331
|
-
db_name.gsub!(/(_)/, ' ')
|
332
|
-
# replaces '.' with ' ' when no numbers are on either side,
|
333
|
-
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
334
|
-
# preserves version numbers
|
335
|
-
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
336
|
-
db_name
|
337
|
-
end
|
338
|
-
|
339
|
-
# Guess whether FASTA file contains protein or nucleotide sequences by
|
340
|
-
# sampling a few characters of the file.
|
341
|
-
def guess_sequence_type_in_fasta(file)
|
342
|
-
sequences = sample_sequences(file)
|
343
|
-
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
344
|
-
sequence_types = sequence_types.uniq.compact
|
345
|
-
(sequence_types.length == 1) && sequence_types.first
|
346
|
-
end
|
347
|
-
|
348
|
-
# Read first 1,048,576 characters of the file, split the read text on
|
349
|
-
# fasta def line pattern and return.
|
350
|
-
#
|
351
|
-
# If the given file is FASTA, returns Array of as many different
|
352
|
-
# sequences in the portion of the file read. Returns the portion
|
353
|
-
# of the file read wrapped in an Array otherwise.
|
354
|
-
def sample_sequences(file)
|
355
|
-
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
356
|
-
end
|
357
223
|
end
|
358
224
|
end
|
359
225
|
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
require 'find'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module SequenceServer
|
5
|
+
# Smart `makeblastdb` wrapper: recursively scans database directory determining
|
6
|
+
# which files need to be formatted or re-formatted.
|
7
|
+
#
|
8
|
+
# Example usage:
|
9
|
+
#
|
10
|
+
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
11
|
+
# makeblastdb.scan && makeblastdb.run
|
12
|
+
#
|
13
|
+
class MAKEBLASTDB
|
14
|
+
# We want V5 databases created using -parse_seqids for proper function of
|
15
|
+
# SequenceServer. This means each database should be comprised of at least 9
|
16
|
+
# files with the following extensions. Databases created by us will have two
|
17
|
+
# additional files with the extensions nhd and nhi, or phd and phi, due to
|
18
|
+
# the use of -hash_index option. Finally, multipart databases will have one
|
19
|
+
# additional file with the extension nal or pal.
|
20
|
+
REQUIRED_EXTENSIONS = {
|
21
|
+
'nucleotide' => %w{ndb nhr nin nog nos not nsq ntf nto}.freeze,
|
22
|
+
'protein' => %w{pdb phr pin pog pos pot psq ptf pto}.freeze
|
23
|
+
}
|
24
|
+
|
25
|
+
extend Forwardable
|
26
|
+
|
27
|
+
def_delegators SequenceServer, :config, :sys
|
28
|
+
|
29
|
+
def initialize(database_dir)
|
30
|
+
@database_dir = database_dir
|
31
|
+
end
|
32
|
+
|
33
|
+
attr_reader :database_dir
|
34
|
+
|
35
|
+
# Scans the database directory to determine which FASTA files require
|
36
|
+
# formatting or re-formatting.
|
37
|
+
#
|
38
|
+
# Returns `true` if there are files to (re-)format, `false` otherwise.
|
39
|
+
def scan
|
40
|
+
# We need to know the list of formatted FASTAs as reported by blastdbcmd
|
41
|
+
# first. This is required to determine both unformatted FASTAs and those
|
42
|
+
# that require reformatting.
|
43
|
+
@formatted_fastas = []
|
44
|
+
determine_formatted_fastas
|
45
|
+
|
46
|
+
# Now determine FASTA files that are unformatted or require reformatting.
|
47
|
+
@fastas_to_format = []
|
48
|
+
determine_unformatted_fastas
|
49
|
+
determine_fastas_to_reformat
|
50
|
+
|
51
|
+
# Return true if there are files to be (re-)formatted or false otherwise.
|
52
|
+
!@fastas_to_format.empty?
|
53
|
+
end
|
54
|
+
|
55
|
+
# Runs makeblastdb on each file in `@fastas_to_format`. Will do nothing
|
56
|
+
# unless `#scan` has been run before.
|
57
|
+
def run
|
58
|
+
return unless @fastas_to_format || @fastas_to_format.empty?
|
59
|
+
@fastas_to_format.each do |path, title, type|
|
60
|
+
make_blast_database(path, title, type)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
# Determines which FASTA files in the database directory are already
|
67
|
+
# formatted. Adds to @formatted_fastas.
|
68
|
+
def determine_formatted_fastas
|
69
|
+
blastdbcmd.each_line do |line|
|
70
|
+
path, title, type = line.split(' ')
|
71
|
+
next if multipart_database_name?(path)
|
72
|
+
@formatted_fastas << [path, title, type.strip.downcase]
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# Determines which FASTA files in the database directory require
|
77
|
+
# reformatting. Adds to @fastas_to_format.
|
78
|
+
def determine_fastas_to_reformat
|
79
|
+
@formatted_fastas.each do |path, title, type|
|
80
|
+
required_extensions = REQUIRED_EXTENSIONS[type]
|
81
|
+
exts = Dir["#{path}.*"].map { |p| p.split('.').last }.sort
|
82
|
+
next if (exts & required_extensions) == required_extensions
|
83
|
+
|
84
|
+
@fastas_to_format << [path, title, type]
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
# Determines which FASTA files in the database directory are
|
89
|
+
# unformatted. Adds to @fastas_to_format.
|
90
|
+
def determine_unformatted_fastas
|
91
|
+
Find.find(database_dir) do |path|
|
92
|
+
next if File.directory?(path)
|
93
|
+
next unless probably_fasta?(path)
|
94
|
+
next if @formatted_fastas.any? { |f| f[0] == path }
|
95
|
+
|
96
|
+
@fastas_to_format << [path,
|
97
|
+
make_db_title(File.basename(path)),
|
98
|
+
guess_sequence_type_in_fasta(path)]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|
103
|
+
# directory. Returns the output of `blastdbcmd`. This method is called
|
104
|
+
# by `determine_formatted_fastas`.
|
105
|
+
def blastdbcmd
|
106
|
+
cmd = "blastdbcmd -recursive -list #{database_dir}" \
|
107
|
+
' -list_outfmt "%f %t %p"'
|
108
|
+
out, _ = sys(cmd, path: config[:bin])
|
109
|
+
out
|
110
|
+
end
|
111
|
+
|
112
|
+
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
113
|
+
def make_blast_database(file, title, type)
|
114
|
+
return unless make_blast_database? file, type
|
115
|
+
title = confirm_database_title(title)
|
116
|
+
taxid = fetch_tax_id
|
117
|
+
_make_blast_database(file, type, title, taxid)
|
118
|
+
end
|
119
|
+
|
120
|
+
def _make_blast_database(file, type, title, taxid)
|
121
|
+
extract_fasta(file) unless File.exist?(file)
|
122
|
+
cmd = "makeblastdb -parse_seqids -hash_index -in #{file} " \
|
123
|
+
"-dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
124
|
+
" -taxid #{taxid}"
|
125
|
+
out, err = sys(cmd, path: config[:bin])
|
126
|
+
puts out.strip
|
127
|
+
puts err.strip
|
128
|
+
rescue CommandFailed => e
|
129
|
+
puts <<~MSG
|
130
|
+
Could not create BLAST database for: #{file}
|
131
|
+
Tried: #{cmd}
|
132
|
+
stdout: #{e.stdout}
|
133
|
+
stderr: #{e.stderr}
|
134
|
+
MSG
|
135
|
+
exit!
|
136
|
+
end
|
137
|
+
|
138
|
+
# Show file path and guessed sequence type to the user and obtain a y/n
|
139
|
+
# response.
|
140
|
+
#
|
141
|
+
# Returns true if the user entered anything but 'n' or 'N'.
|
142
|
+
def make_blast_database?(file, type)
|
143
|
+
puts
|
144
|
+
puts
|
145
|
+
puts "FASTA file to format/reformat: #{file}"
|
146
|
+
puts "FASTA type: #{type}"
|
147
|
+
print 'Proceed? [y/n] (Default: y): '
|
148
|
+
response = STDIN.gets.to_s.strip
|
149
|
+
!response.match(/n/i)
|
150
|
+
end
|
151
|
+
|
152
|
+
# Show the database title that we are going to use to the user for
|
153
|
+
# confirmation.
|
154
|
+
#
|
155
|
+
# Returns user input if any. Auto-determined title otherwise.
|
156
|
+
def confirm_database_title(default)
|
157
|
+
print "Enter a database title or will use '#{default}': "
|
158
|
+
from_user = STDIN.gets.to_s.strip
|
159
|
+
from_user.empty? && default || from_user
|
160
|
+
end
|
161
|
+
|
162
|
+
# Get taxid from the user. Returns user input or 0.
|
163
|
+
#
|
164
|
+
# Using 0 as taxid is equivalent to not setting taxid for the database
|
165
|
+
# that will be created.
|
166
|
+
def fetch_tax_id
|
167
|
+
default = 0
|
168
|
+
print 'Enter taxid (optional): '
|
169
|
+
user_response = STDIN.gets.strip
|
170
|
+
user_response.empty? && default || Integer(user_response)
|
171
|
+
rescue
|
172
|
+
puts 'taxid should be a number'
|
173
|
+
retry
|
174
|
+
end
|
175
|
+
|
176
|
+
# Extract FASTA file from BLAST database.
|
177
|
+
#
|
178
|
+
# Invoked while reformatting a BLAST database if the corresponding
|
179
|
+
# FASTA file does not exist.
|
180
|
+
def extract_fasta(db)
|
181
|
+
puts
|
182
|
+
puts 'Extracting sequences ...'
|
183
|
+
cmd = "blastdbcmd -entry all -db #{db}"
|
184
|
+
sys(cmd, stdout: db, path: config[:bin])
|
185
|
+
rescue CommandFailed => e
|
186
|
+
puts <<~MSG
|
187
|
+
Could not extract sequences from: #{db}
|
188
|
+
Tried: #{cmd}
|
189
|
+
stdout: #{e.stdout}
|
190
|
+
stderr: #{e.stderr}
|
191
|
+
MSG
|
192
|
+
exit!
|
193
|
+
end
|
194
|
+
|
195
|
+
# Returns true if the database name appears to be a multi-part database
|
196
|
+
# name.
|
197
|
+
def multipart_database_name?(db_name)
|
198
|
+
Database.multipart_database_name? db_name
|
199
|
+
end
|
200
|
+
|
201
|
+
# Returns true if first character of the file is '>'.
|
202
|
+
def probably_fasta?(file)
|
203
|
+
File.read(file, 1) == '>'
|
204
|
+
end
|
205
|
+
|
206
|
+
# Suggests improved titles when generating database names from files
|
207
|
+
# for improved appearance and readability in web interface.
|
208
|
+
# For example:
|
209
|
+
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
210
|
+
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
211
|
+
def make_db_title(db_name)
|
212
|
+
db_name.tr!('"', "'")
|
213
|
+
# removes .fasta like extension names
|
214
|
+
db_name.gsub!(File.extname(db_name), '')
|
215
|
+
# replaces _ with ' ',
|
216
|
+
db_name.gsub!(/(_)/, ' ')
|
217
|
+
# replaces '.' with ' ' when no numbers are on either side,
|
218
|
+
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
219
|
+
# preserves version numbers
|
220
|
+
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
221
|
+
db_name
|
222
|
+
end
|
223
|
+
|
224
|
+
# Guess whether FASTA file contains protein or nucleotide sequences by
|
225
|
+
# sampling a few characters of the file.
|
226
|
+
def guess_sequence_type_in_fasta(file)
|
227
|
+
sequences = sample_sequences(file)
|
228
|
+
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
229
|
+
sequence_types = sequence_types.uniq.compact
|
230
|
+
(sequence_types.length == 1) && sequence_types.first
|
231
|
+
end
|
232
|
+
|
233
|
+
# Read first 1,048,576 characters of the file, split the read text on
|
234
|
+
# fasta def line pattern and return.
|
235
|
+
#
|
236
|
+
# If the given file is FASTA, returns Array of as many different
|
237
|
+
# sequences in the portion of the file read. Returns the portion
|
238
|
+
# of the file read wrapped in an Array otherwise.
|
239
|
+
def sample_sequences(file)
|
240
|
+
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|