sequenceserver 2.0.0.beta4 → 2.0.0.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.dockerignore +1 -0
- data/.travis.yml +7 -4
- data/AppImage/sequenceserver.sh +5 -0
- data/Dockerfile +14 -12
- data/bin/sequenceserver +37 -28
- data/lib/sequenceserver.rb +35 -7
- data/lib/sequenceserver/blast/job.rb +18 -25
- data/lib/sequenceserver/blast/report.rb +68 -34
- data/lib/sequenceserver/config.rb +1 -1
- data/lib/sequenceserver/database.rb +0 -129
- data/lib/sequenceserver/makeblastdb.rb +243 -0
- data/lib/sequenceserver/routes.rb +28 -2
- data/lib/sequenceserver/version.rb +1 -1
- data/public/SequenceServer_logo.png +0 -0
- data/public/css/grapher.css +8 -15
- data/public/css/sequenceserver.css +119 -55
- data/public/css/sequenceserver.min.css +3 -3
- data/public/js/circos.js +1 -1
- data/public/js/download_fasta.js +17 -0
- data/public/js/grapher.js +7 -9
- data/public/js/hit.js +217 -0
- data/public/js/hits_overview.js +12 -13
- data/public/js/hsp.js +104 -84
- data/public/js/{sequenceserver.js → jquery_world.js} +1 -18
- data/public/js/kablammo.js +337 -334
- data/public/js/length_distribution.js +1 -1
- data/public/js/query.js +147 -0
- data/public/js/report.js +216 -836
- data/public/js/search.js +194 -192
- data/public/js/sequence_modal.js +167 -0
- data/public/js/sidebar.js +210 -0
- data/public/js/utils.js +2 -19
- data/public/js/visualisation_helpers.js +2 -2
- data/public/sequenceserver-report.min.js +19 -19
- data/public/sequenceserver-search.min.js +11 -11
- data/public/vendor/github/twbs/bootstrap@3.3.5/js/bootstrap.js +2 -2
- data/spec/blast_versions/blast_2.2.30/import_spec_capybara_local_2.2.30.rb +15 -15
- data/spec/blast_versions/blast_2.2.31/import_spec_capybara_local_2.2.31.rb +15 -15
- data/spec/blast_versions/blast_2.3.0/import_spec_capybara_local_2.3.0.rb +15 -15
- data/spec/blast_versions/blast_2.4.0/import_spec_capybara_local_2.4.0.rb +15 -15
- data/spec/blast_versions/blast_2.5.0/import_spec_capybara_local_2.5.0.rb +15 -15
- data/spec/blast_versions/blast_2.6.0/import_spec_capybara_local_2.6.0.rb +15 -15
- data/spec/blast_versions/blast_2.7.1/import_spec_capybara_local_2.7.1.rb +15 -15
- data/spec/blast_versions/blast_2.8.1/import_spec_capybara_local_2.8.1.rb +15 -15
- data/spec/blast_versions/blast_2.9.0/import_spec_capybara_local_2.9.0.rb +15 -15
- data/spec/blast_versions/diamond_0.9.24/import_spec_capybara_local_0.9.24.rb +6 -6
- data/spec/capybara_spec.rb +14 -3
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ndb +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nos +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.not +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ntf +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nto +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pdb +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pos +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pot +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.ptf +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pto +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pdb +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pos +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pot +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.ptf +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pto +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ndb +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nos +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.not +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ntf +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nto +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhd +8 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsq +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.txt +8 -0
- data/spec/database/v4/links.rb +23 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phd +1189 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phd +9140 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/URL +1 -0
- data/spec/database/v4/si_uniprot_idmap.yml +14180 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhd +473 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database_spec.rb +0 -76
- data/spec/makeblastdb_spec.rb +121 -0
- data/views/layout.erb +5 -1
- metadata +75 -15
@@ -81,7 +81,7 @@ module SequenceServer
|
|
81
81
|
# otherwise.
|
82
82
|
def parse_config_file
|
83
83
|
unless file? config_file
|
84
|
-
logger.
|
84
|
+
logger.debug "Configuration file not found: #{config_file}"
|
85
85
|
return {}
|
86
86
|
end
|
87
87
|
logger.info "Reading configuration file: #{config_file}."
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'find'
|
2
1
|
require 'open3'
|
3
2
|
require 'digest/md5'
|
4
3
|
require 'forwardable'
|
@@ -209,89 +208,6 @@ module SequenceServer
|
|
209
208
|
end
|
210
209
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
211
210
|
|
212
|
-
# Recursively scan `database_dir` for un-formatted FASTA and format them
|
213
|
-
# for use with BLAST+.
|
214
|
-
def make_blast_databases
|
215
|
-
unformatted_fastas.select do |file, sequence_type|
|
216
|
-
make_blast_database(file, sequence_type)
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
# Returns an Array of FASTA files that may require formatting, and the
|
221
|
-
# type of sequence contained in each FASTA.
|
222
|
-
#
|
223
|
-
# > unformatted_fastas
|
224
|
-
# => [['/foo/bar.fasta', :nucleotide], ...]
|
225
|
-
def unformatted_fastas
|
226
|
-
list = []
|
227
|
-
database_dir = config[:database_dir]
|
228
|
-
Find.find database_dir do |file|
|
229
|
-
next if File.directory? file
|
230
|
-
next if Database.include? file
|
231
|
-
next unless probably_fasta? file
|
232
|
-
sequence_type = guess_sequence_type_in_fasta file
|
233
|
-
if %i[protein nucleotide].include?(sequence_type)
|
234
|
-
list << [file, sequence_type]
|
235
|
-
end
|
236
|
-
end
|
237
|
-
list
|
238
|
-
end
|
239
|
-
|
240
|
-
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
241
|
-
def make_blast_database(file, type)
|
242
|
-
return unless make_blast_database? file, type
|
243
|
-
title = get_database_title(file)
|
244
|
-
taxid = fetch_tax_id
|
245
|
-
_make_blast_database(file, type, title, taxid)
|
246
|
-
end
|
247
|
-
|
248
|
-
def _make_blast_database(file, type, title, taxid, quiet = false)
|
249
|
-
cmd = 'makeblastdb -parse_seqids -hash_index ' \
|
250
|
-
"-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
251
|
-
" -taxid #{taxid}"
|
252
|
-
out, err = sys(cmd, path: config[:bin])
|
253
|
-
puts out, err unless quiet
|
254
|
-
end
|
255
|
-
|
256
|
-
# Show file path and guessed sequence type to the user and obtain a y/n
|
257
|
-
# response.
|
258
|
-
#
|
259
|
-
# Returns true if the user entered anything but 'n' or 'N'.
|
260
|
-
def make_blast_database?(file, type)
|
261
|
-
puts
|
262
|
-
puts
|
263
|
-
puts "FASTA file: #{file}"
|
264
|
-
puts "FASTA type: #{type}"
|
265
|
-
print 'Proceed? [y/n] (Default: y): '
|
266
|
-
response = STDIN.gets.to_s.strip
|
267
|
-
!response.match(/n/i)
|
268
|
-
end
|
269
|
-
|
270
|
-
# Generate a title for the given database and show it to the user for
|
271
|
-
# confirmation.
|
272
|
-
#
|
273
|
-
# Returns user input if any. Auto-generated title otherwise.
|
274
|
-
def get_database_title(path)
|
275
|
-
default = make_db_title(File.basename(path))
|
276
|
-
print "Enter a database title or will use '#{default}': "
|
277
|
-
from_user = STDIN.gets.to_s.strip
|
278
|
-
from_user.empty? && default || from_user
|
279
|
-
end
|
280
|
-
|
281
|
-
# Get taxid from the user. Returns user input or 0.
|
282
|
-
#
|
283
|
-
# Using 0 as taxid is equivalent to not setting taxid for the database
|
284
|
-
# that will be created.
|
285
|
-
def fetch_tax_id
|
286
|
-
default = 0
|
287
|
-
print 'Enter taxid (optional): '
|
288
|
-
user_response = STDIN.gets.strip
|
289
|
-
user_response.empty? && default || Integer(user_response)
|
290
|
-
rescue
|
291
|
-
puts 'taxid should be a number'
|
292
|
-
retry
|
293
|
-
end
|
294
|
-
|
295
211
|
# Returns true if the database name appears to be a multi-part database
|
296
212
|
# name.
|
297
213
|
#
|
@@ -304,51 +220,6 @@ module SequenceServer
|
|
304
220
|
def multipart_database_name?(db_name)
|
305
221
|
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
306
222
|
end
|
307
|
-
|
308
|
-
# Returns true if first character of the file is '>'.
|
309
|
-
def probably_fasta?(file)
|
310
|
-
File.read(file, 1) == '>'
|
311
|
-
end
|
312
|
-
|
313
|
-
# Suggests improved titles when generating database names from files
|
314
|
-
# for improved appearance and readability in web interface.
|
315
|
-
# For example:
|
316
|
-
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
317
|
-
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
318
|
-
def make_db_title(db_name)
|
319
|
-
db_name.tr!('"', "'")
|
320
|
-
# removes .fasta like extension names
|
321
|
-
db_name.gsub!(File.extname(db_name), '')
|
322
|
-
# replaces _ with ' ',
|
323
|
-
db_name.gsub!(/(_)/, ' ')
|
324
|
-
# replaces '.' with ' ' when no numbers are on either side,
|
325
|
-
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
326
|
-
# preserves version numbers
|
327
|
-
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
328
|
-
db_name
|
329
|
-
end
|
330
|
-
|
331
|
-
# Guess whether FASTA file contains protein or nucleotide sequences based
|
332
|
-
# on first 32768 characters.
|
333
|
-
#
|
334
|
-
# NOTE: 2^15 == 32768. Approximately 546 lines, assuming 60 characters on
|
335
|
-
# each line.
|
336
|
-
def guess_sequence_type_in_fasta(file)
|
337
|
-
sequences = sample_sequences(file)
|
338
|
-
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
339
|
-
sequence_types = sequence_types.uniq.compact
|
340
|
-
(sequence_types.length == 1) && sequence_types.first
|
341
|
-
end
|
342
|
-
|
343
|
-
# Read first 32768 characters of the file. Split on fasta def line
|
344
|
-
# pattern and return.
|
345
|
-
#
|
346
|
-
# If the given file is FASTA, returns Array of as many different
|
347
|
-
# sequences in the portion of the file read. Returns the portion
|
348
|
-
# of the file read wrapped in an Array otherwise.
|
349
|
-
def sample_sequences(file)
|
350
|
-
File.read(file, 32_768).split(/^>.+$/).delete_if(&:empty?)
|
351
|
-
end
|
352
223
|
end
|
353
224
|
end
|
354
225
|
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
require 'find'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module SequenceServer
  # Smart `makeblastdb` wrapper: recursively scans database directory determining
  # which files need to be formatted or re-formatted.
  #
  # Example usage:
  #
  #   makeblastdb = MAKEBLASTDB.new(database_dir)
  #   makeblastdb.scan && makeblastdb.run
  #
  class MAKEBLASTDB
    # We want V5 databases created using -parse_seqids for proper function of
    # SequenceServer. This means each database should be comprised of at least 9
    # files with the following extensions. Databases created by us will have two
    # additional files with the extensions nhd and nhi, or phd and phi, due to
    # the use of -hash_index option. Finally, multipart databases will have one
    # additional file with the extension nal or pal.
    REQUIRED_EXTENSIONS = {
      'nucleotide' => %w[ndb nhr nin nog nos not nsq ntf nto].freeze,
      'protein'    => %w[pdb phr pin pog pos pot psq ptf pto].freeze
    }.freeze

    extend Forwardable

    def_delegators SequenceServer, :config, :sys

    # database_dir - path of the directory to scan (recursively).
    def initialize(database_dir)
      @database_dir = database_dir
    end

    attr_reader :database_dir

    # Scans the database directory to determine which FASTA files require
    # formatting or re-formatting.
    #
    # Returns `true` if there are files to (re-)format, `false` otherwise.
    def scan
      # We need to know the list of formatted FASTAs as reported by blastdbcmd
      # first. This is required to determine both unformatted FASTAs and those
      # that require reformatting.
      @formatted_fastas = []
      determine_formatted_fastas

      # Now determine FASTA files that are unformatted or require reformatting.
      @fastas_to_format = []
      determine_unformatted_fastas
      determine_fastas_to_reformat

      # Return true if there are files to be (re-)formatted or false otherwise.
      !@fastas_to_format.empty?
    end

    # Runs makeblastdb on each file in `@fastas_to_format`. Does nothing (and
    # returns nil) unless `#scan` has been run before and found something to
    # format.
    def run
      return if @fastas_to_format.nil? || @fastas_to_format.empty?

      @fastas_to_format.each do |path, title, type|
        make_blast_database(path, title, type)
      end
    end

    private

    # Determines which FASTA files in the database directory are already
    # formatted. Adds `[path, title, type]` triplets to @formatted_fastas,
    # with type lower-cased.
    #
    # Each line of `blastdbcmd` output is "<path> <title> <type>". The title
    # may itself contain spaces (or be empty), so the first whitespace
    # delimited field is taken as the path, the last one as the type, and
    # whatever is in between as the title. Paths containing whitespace can't
    # be told apart from the title with this output format.
    def determine_formatted_fastas
      blastdbcmd.each_line do |line|
        path, rest = line.strip.split(' ', 2)
        next if rest.nil? # blank or incomplete line

        title, _separator, type = rest.strip.rpartition(' ')
        next if multipart_database_name?(path)

        @formatted_fastas << [path, title.strip, type.strip.downcase]
      end
    end

    # Determines which FASTA files in the database directory require
    # reformatting, i.e., those for which one or more of the files listed in
    # REQUIRED_EXTENSIONS is missing. Adds to @fastas_to_format.
    def determine_fastas_to_reformat
      @formatted_fastas.each do |path, title, type|
        required_extensions = REQUIRED_EXTENSIONS[type]
        # We can't tell what files to expect for a type we do not know about.
        next unless required_extensions

        exts = Dir["#{path}.*"].map { |p| p.split('.').last }
        next if (required_extensions - exts).empty?

        @fastas_to_format << [path, title, type]
      end
    end

    # Determines which FASTA files in the database directory are
    # unformatted. Adds to @fastas_to_format.
    #
    # Files for which the sequence type can't be determined (e.g., a mix of
    # protein and nucleotide sequences) are skipped: makeblastdb can't be run
    # without a valid -dbtype.
    def determine_unformatted_fastas
      Find.find(database_dir) do |path|
        next if File.directory?(path)
        next unless probably_fasta?(path)
        next if @formatted_fastas.any? { |f| f[0] == path }

        type = guess_sequence_type_in_fasta(path)
        next unless type

        @fastas_to_format << [path, make_db_title(File.basename(path)), type]
      end
    end

    # Runs `blastdbcmd` to determine formatted FASTA files in the database
    # directory. Returns the output of `blastdbcmd`. This method is called
    # by `determine_formatted_fastas`.
    def blastdbcmd
      cmd = "blastdbcmd -recursive -list #{database_dir}" \
            ' -list_outfmt "%f %t %p"'
      out, = sys(cmd, path: config[:bin])
      out
    end

    # Create BLAST database, given FASTA file, suggested title, and sequence
    # type in FASTA file. Asks the user for confirmation, title, and taxid.
    def make_blast_database(file, title, type)
      return unless make_blast_database? file, type

      title = confirm_database_title(title)
      taxid = fetch_tax_id
      _make_blast_database(file, type, title, taxid)
    end

    # Runs `makeblastdb`. If the FASTA file itself is missing (reformatting a
    # database for which only the index files exist) the sequences are first
    # extracted from the existing database. Prints diagnostics and exits the
    # process if the command fails.
    def _make_blast_database(file, type, title, taxid)
      extract_fasta(file) unless File.exist?(file)
      cmd = "makeblastdb -parse_seqids -hash_index -in #{file} " \
            "-dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
            " -taxid #{taxid}"
      out, err = sys(cmd, path: config[:bin])
      puts out.strip
      puts err.strip
    rescue CommandFailed => e
      puts <<~MSG
        Could not create BLAST database for: #{file}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end

    # Show file path and guessed sequence type to the user and obtain a y/n
    # response.
    #
    # Returns false if the response contains the letter 'n' (any case), true
    # otherwise (including an empty response).
    def make_blast_database?(file, type)
      puts
      puts
      puts "FASTA file to format/reformat: #{file}"
      puts "FASTA type: #{type}"
      print 'Proceed? [y/n] (Default: y): '
      response = STDIN.gets.to_s.strip
      !response.match(/n/i)
    end

    # Show the database title that we are going to use to the user for
    # confirmation.
    #
    # Returns user input if any. Auto-determined title otherwise.
    def confirm_database_title(default)
      print "Enter a database title or will use '#{default}': "
      from_user = STDIN.gets.to_s.strip
      from_user.empty? && default || from_user
    end

    # Get taxid from the user. Returns user input or 0.
    #
    # Using 0 as taxid is equivalent to not setting taxid for the database
    # that will be created. Asks again if the input is not an integer. End of
    # input (STDIN closed) is treated the same as an empty response so that we
    # don't keep asking forever when run non-interactively.
    def fetch_tax_id
      default = 0
      print 'Enter taxid (optional): '
      user_response = STDIN.gets.to_s.strip
      user_response.empty? && default || Integer(user_response)
    rescue ArgumentError
      puts 'taxid should be a number'
      retry
    end

    # Extract FASTA file from BLAST database.
    #
    # Invoked while reformatting a BLAST database if the corresponding
    # FASTA file does not exist. Prints diagnostics and exits the process if
    # the command fails.
    def extract_fasta(db)
      puts
      puts 'Extracting sequences ...'
      cmd = "blastdbcmd -entry all -db #{db}"
      sys(cmd, stdout: db, path: config[:bin])
    rescue CommandFailed => e
      puts <<~MSG
        Could not extract sequences from: #{db}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end

    # Returns true if the database name appears to be a multi-part database
    # name.
    def multipart_database_name?(db_name)
      Database.multipart_database_name? db_name
    end

    # Returns true if first character of the file is '>'.
    def probably_fasta?(file)
      File.read(file, 1) == '>'
    end

    # Suggests improved titles when generating database names from files
    # for improved appearance and readability in web interface.
    # For example:
    # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
    # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
    #
    # Returns a new String; `db_name` is left untouched.
    def make_db_title(db_name)
      title = db_name.tr('"', "'")
      # removes .fasta like extension names (only the trailing one)
      ext = File.extname(title)
      title = title[0...-ext.length] unless ext.empty?
      # replaces _ with ' ',
      title = title.tr('_', ' ')
      # replaces '.' with ' ' when no numbers are on either side,
      title = title.gsub(/(\D)\.(?=\D)/, '\1 ')
      # preserves version numbers
      title.gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
    end

    # Guess whether FASTA file contains protein or nucleotide sequences by
    # sampling the first few characters of the file.
    #
    # Returns the type if all sampled sequences agree on it, `false` otherwise.
    def guess_sequence_type_in_fasta(file)
      sequence_types = sample_sequences(file).map do |seq|
        Sequence.guess_type(seq)
      end
      sequence_types = sequence_types.uniq.compact
      (sequence_types.length == 1) && sequence_types.first
    end

    # Read first 1,048,576 characters of the file, split the read text on
    # fasta def line pattern and return.
    #
    # If the given file is FASTA, returns Array of as many different
    # sequences in the portion of the file read. Returns the portion
    # of the file read wrapped in an Array otherwise. Returns an empty Array
    # for an empty file.
    def sample_sequences(file)
      File.read(file, 1_048_576).to_s.split(/^>.+$/).reject(&:empty?)
    end
  end
end
|
@@ -69,10 +69,16 @@ module SequenceServer
|
|
69
69
|
# include available databases and user-defined search options.
|
70
70
|
get '/searchdata.json' do
|
71
71
|
searchdata = {
|
72
|
+
query: Database.retrieve(params[:query]),
|
72
73
|
database: Database.all,
|
73
|
-
options: SequenceServer.config[:options]
|
74
|
-
query: Database.retrieve(params[:query])
|
74
|
+
options: SequenceServer.config[:options]
|
75
75
|
}
|
76
|
+
|
77
|
+
# If a job_id is specified, update searchdata from job meta data (i.e.,
|
78
|
+
# query, pre-selected databases, advanced options used). Query is only
|
79
|
+
# updated if params[:query] is not specified.
|
80
|
+
update_searchdata_from_job(searchdata) if params[:job_id]
|
81
|
+
|
76
82
|
searchdata.to_json
|
77
83
|
end
|
78
84
|
|
@@ -180,5 +186,25 @@ module SequenceServer
|
|
180
186
|
|
181
187
|
error_data.to_json
|
182
188
|
end
|
189
|
+
|
190
|
+
# Get the query sequences, selected databases, and advanced params used.
|
191
|
+
# Get the query sequences, selected databases, and advanced params used for
# the job identified by params[:job_id] and update `searchdata` (a Hash with
# :query, :database and :options keys) in place accordingly. Does nothing for
# jobs created by importing an XML file.
def update_searchdata_from_job(searchdata)
  job = Job.fetch(params[:job_id])
  return if job.imported_xml_file

  # Only read job.qfile if we are not going to use Database.retrieve.
  searchdata[:query] = File.read(job.qfile) unless params[:query]

  # Which databases to pre-select.
  searchdata[:preSelectedDbs] = job.databases

  # job.advanced may be nil in case of old jobs. In this case, we do not
  # override searchdata so that default advanced parameters can be applied.
  # Note that, job.advanced will be an empty string if a user deletes the
  # default advanced parameters from the advanced params input field. In
  # this case, we do want the advanced params input field to be empty when
  # the user hits the back button. Thus we do not test for empty string.
  return unless job.advanced

  # searchdata[:options] is the very Hash stored in the global configuration.
  # Build a new Hash instead of assigning into it, otherwise advanced params
  # of this job would become the defaults for all subsequent requests.
  searchdata[:options] =
    (searchdata[:options] || {}).merge(job.method => [job.advanced])
end
|
183
209
|
end
|
184
210
|
end
|