sequenceserver 2.0.0.beta4 → 2.0.0.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.dockerignore +1 -0
- data/.travis.yml +7 -4
- data/AppImage/sequenceserver.sh +5 -0
- data/Dockerfile +14 -12
- data/bin/sequenceserver +37 -28
- data/lib/sequenceserver.rb +35 -7
- data/lib/sequenceserver/blast/job.rb +18 -25
- data/lib/sequenceserver/blast/report.rb +68 -34
- data/lib/sequenceserver/config.rb +1 -1
- data/lib/sequenceserver/database.rb +0 -129
- data/lib/sequenceserver/makeblastdb.rb +243 -0
- data/lib/sequenceserver/routes.rb +28 -2
- data/lib/sequenceserver/version.rb +1 -1
- data/public/SequenceServer_logo.png +0 -0
- data/public/css/grapher.css +8 -15
- data/public/css/sequenceserver.css +119 -55
- data/public/css/sequenceserver.min.css +3 -3
- data/public/js/circos.js +1 -1
- data/public/js/download_fasta.js +17 -0
- data/public/js/grapher.js +7 -9
- data/public/js/hit.js +217 -0
- data/public/js/hits_overview.js +12 -13
- data/public/js/hsp.js +104 -84
- data/public/js/{sequenceserver.js → jquery_world.js} +1 -18
- data/public/js/kablammo.js +337 -334
- data/public/js/length_distribution.js +1 -1
- data/public/js/query.js +147 -0
- data/public/js/report.js +216 -836
- data/public/js/search.js +194 -192
- data/public/js/sequence_modal.js +167 -0
- data/public/js/sidebar.js +210 -0
- data/public/js/utils.js +2 -19
- data/public/js/visualisation_helpers.js +2 -2
- data/public/sequenceserver-report.min.js +19 -19
- data/public/sequenceserver-search.min.js +11 -11
- data/public/vendor/github/twbs/bootstrap@3.3.5/js/bootstrap.js +2 -2
- data/spec/blast_versions/blast_2.2.30/import_spec_capybara_local_2.2.30.rb +15 -15
- data/spec/blast_versions/blast_2.2.31/import_spec_capybara_local_2.2.31.rb +15 -15
- data/spec/blast_versions/blast_2.3.0/import_spec_capybara_local_2.3.0.rb +15 -15
- data/spec/blast_versions/blast_2.4.0/import_spec_capybara_local_2.4.0.rb +15 -15
- data/spec/blast_versions/blast_2.5.0/import_spec_capybara_local_2.5.0.rb +15 -15
- data/spec/blast_versions/blast_2.6.0/import_spec_capybara_local_2.6.0.rb +15 -15
- data/spec/blast_versions/blast_2.7.1/import_spec_capybara_local_2.7.1.rb +15 -15
- data/spec/blast_versions/blast_2.8.1/import_spec_capybara_local_2.8.1.rb +15 -15
- data/spec/blast_versions/blast_2.9.0/import_spec_capybara_local_2.9.0.rb +15 -15
- data/spec/blast_versions/diamond_0.9.24/import_spec_capybara_local_0.9.24.rb +6 -6
- data/spec/capybara_spec.rb +14 -3
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ndb +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nos +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.not +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ntf +0 -0
- data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nto +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pdb +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pos +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pot +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.ptf +0 -0
- data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pto +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pdb +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pos +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pot +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.ptf +0 -0
- data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pto +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ndb +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nos +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.not +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ntf +0 -0
- data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nto +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhd +8 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsi +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsq +0 -0
- data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.txt +8 -0
- data/spec/database/v4/links.rb +23 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phd +1189 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psi +0 -0
- data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phd +9140 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pog +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psd +0 -0
- data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psi +0 -0
- data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psq +0 -0
- data/spec/database/v4/proteins/uniprot/URL +1 -0
- data/spec/database/v4/si_uniprot_idmap.yml +14180 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhd +473 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nog +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsd +0 -0
- data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsi +0 -0
- data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/spec/database_spec.rb +0 -76
- data/spec/makeblastdb_spec.rb +121 -0
- data/views/layout.erb +5 -1
- metadata +75 -15
|
@@ -81,7 +81,7 @@ module SequenceServer
|
|
|
81
81
|
# otherwise.
|
|
82
82
|
def parse_config_file
|
|
83
83
|
unless file? config_file
|
|
84
|
-
logger.
|
|
84
|
+
logger.debug "Configuration file not found: #{config_file}"
|
|
85
85
|
return {}
|
|
86
86
|
end
|
|
87
87
|
logger.info "Reading configuration file: #{config_file}."
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
require 'find'
|
|
2
1
|
require 'open3'
|
|
3
2
|
require 'digest/md5'
|
|
4
3
|
require 'forwardable'
|
|
@@ -209,89 +208,6 @@ module SequenceServer
|
|
|
209
208
|
end
|
|
210
209
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
|
211
210
|
|
|
212
|
-
# Recursively scan `database_dir` for un-formatted FASTA and format them
|
|
213
|
-
# for use with BLAST+.
|
|
214
|
-
def make_blast_databases
|
|
215
|
-
unformatted_fastas.select do |file, sequence_type|
|
|
216
|
-
make_blast_database(file, sequence_type)
|
|
217
|
-
end
|
|
218
|
-
end
|
|
219
|
-
|
|
220
|
-
# Returns an Array of FASTA files that may require formatting, and the
|
|
221
|
-
# type of sequence contained in each FASTA.
|
|
222
|
-
#
|
|
223
|
-
# > unformatted_fastas
|
|
224
|
-
# => [['/foo/bar.fasta', :nulceotide], ...]
|
|
225
|
-
def unformatted_fastas
|
|
226
|
-
list = []
|
|
227
|
-
database_dir = config[:database_dir]
|
|
228
|
-
Find.find database_dir do |file|
|
|
229
|
-
next if File.directory? file
|
|
230
|
-
next if Database.include? file
|
|
231
|
-
next unless probably_fasta? file
|
|
232
|
-
sequence_type = guess_sequence_type_in_fasta file
|
|
233
|
-
if %i[protein nucleotide].include?(sequence_type)
|
|
234
|
-
list << [file, sequence_type]
|
|
235
|
-
end
|
|
236
|
-
end
|
|
237
|
-
list
|
|
238
|
-
end
|
|
239
|
-
|
|
240
|
-
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
|
241
|
-
def make_blast_database(file, type)
|
|
242
|
-
return unless make_blast_database? file, type
|
|
243
|
-
title = get_database_title(file)
|
|
244
|
-
taxid = fetch_tax_id
|
|
245
|
-
_make_blast_database(file, type, title, taxid)
|
|
246
|
-
end
|
|
247
|
-
|
|
248
|
-
def _make_blast_database(file, type, title, taxid, quiet = false)
|
|
249
|
-
cmd = 'makeblastdb -parse_seqids -hash_index ' \
|
|
250
|
-
"-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
|
251
|
-
" -taxid #{taxid}"
|
|
252
|
-
out, err = sys(cmd, path: config[:bin])
|
|
253
|
-
puts out, err unless quiet
|
|
254
|
-
end
|
|
255
|
-
|
|
256
|
-
# Show file path and guessed sequence type to the user and obtain a y/n
|
|
257
|
-
# response.
|
|
258
|
-
#
|
|
259
|
-
# Returns true if the user entered anything but 'n' or 'N'.
|
|
260
|
-
def make_blast_database?(file, type)
|
|
261
|
-
puts
|
|
262
|
-
puts
|
|
263
|
-
puts "FASTA file: #{file}"
|
|
264
|
-
puts "FASTA type: #{type}"
|
|
265
|
-
print 'Proceed? [y/n] (Default: y): '
|
|
266
|
-
response = STDIN.gets.to_s.strip
|
|
267
|
-
!response.match(/n/i)
|
|
268
|
-
end
|
|
269
|
-
|
|
270
|
-
# Generate a title for the given database and show it to the user for
|
|
271
|
-
# confirmation.
|
|
272
|
-
#
|
|
273
|
-
# Returns user input if any. Auto-generated title otherwise.
|
|
274
|
-
def get_database_title(path)
|
|
275
|
-
default = make_db_title(File.basename(path))
|
|
276
|
-
print "Enter a database title or will use '#{default}': "
|
|
277
|
-
from_user = STDIN.gets.to_s.strip
|
|
278
|
-
from_user.empty? && default || from_user
|
|
279
|
-
end
|
|
280
|
-
|
|
281
|
-
# Get taxid from the user. Returns user input or 0.
|
|
282
|
-
#
|
|
283
|
-
# Using 0 as taxid is equivalent to not setting taxid for the database
|
|
284
|
-
# that will be created.
|
|
285
|
-
def fetch_tax_id
|
|
286
|
-
default = 0
|
|
287
|
-
print 'Enter taxid (optional): '
|
|
288
|
-
user_response = STDIN.gets.strip
|
|
289
|
-
user_response.empty? && default || Integer(user_response)
|
|
290
|
-
rescue
|
|
291
|
-
puts 'taxid should be a number'
|
|
292
|
-
retry
|
|
293
|
-
end
|
|
294
|
-
|
|
295
211
|
# Returns true if the database name appears to be a multi-part database
|
|
296
212
|
# name.
|
|
297
213
|
#
|
|
@@ -304,51 +220,6 @@ module SequenceServer
|
|
|
304
220
|
def multipart_database_name?(db_name)
  # A directory part, a base name, then a dot followed by a two or three
  # digit volume number at the end of a line.
  volume = db_name.match(%r{.+/\S+\.\d{2,3}$})
  volume ? true : false
end
|
|
307
|
-
|
|
308
|
-
# Returns true if first character of the file is '>'.
|
|
309
|
-
def probably_fasta?(file)
|
|
310
|
-
File.read(file, 1) == '>'
|
|
311
|
-
end
|
|
312
|
-
|
|
313
|
-
# Suggests improved titles when generating database names from files
|
|
314
|
-
# for improved apperance and readability in web interface.
|
|
315
|
-
# For example:
|
|
316
|
-
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
|
317
|
-
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
|
318
|
-
def make_db_title(db_name)
|
|
319
|
-
db_name.tr!('"', "'")
|
|
320
|
-
# removes .fasta like extension names
|
|
321
|
-
db_name.gsub!(File.extname(db_name), '')
|
|
322
|
-
# replaces _ with ' ',
|
|
323
|
-
db_name.gsub!(/(_)/, ' ')
|
|
324
|
-
# replaces '.' with ' ' when no numbers are on either side,
|
|
325
|
-
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
|
326
|
-
# preserves version numbers
|
|
327
|
-
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
|
328
|
-
db_name
|
|
329
|
-
end
|
|
330
|
-
|
|
331
|
-
# Guess whether FASTA file contains protein or nucleotide sequences based
|
|
332
|
-
# on first 32768 characters.
|
|
333
|
-
#
|
|
334
|
-
# NOTE: 2^15 == 32786. Approximately 546 lines, assuming 60 characters on
|
|
335
|
-
# each line.
|
|
336
|
-
def guess_sequence_type_in_fasta(file)
|
|
337
|
-
sequences = sample_sequences(file)
|
|
338
|
-
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
|
339
|
-
sequence_types = sequence_types.uniq.compact
|
|
340
|
-
(sequence_types.length == 1) && sequence_types.first
|
|
341
|
-
end
|
|
342
|
-
|
|
343
|
-
# Read first 32768 characters of the file. Split on fasta def line
|
|
344
|
-
# pattern and return.
|
|
345
|
-
#
|
|
346
|
-
# If the given file is FASTA, returns Array of as many different
|
|
347
|
-
# sequences in the portion of the file read. Returns the portion
|
|
348
|
-
# of the file read wrapped in an Array otherwise.
|
|
349
|
-
def sample_sequences(file)
|
|
350
|
-
File.read(file, 32_768).split(/^>.+$/).delete_if(&:empty?)
|
|
351
|
-
end
|
|
352
223
|
end
|
|
353
224
|
end
|
|
354
225
|
end
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
require 'find'
require 'forwardable'
require 'shellwords'
|
|
3
|
+
|
|
4
|
+
module SequenceServer
|
|
5
|
+
# Smart `makeblastdb` wrapper: recursively scans database directory determining
|
|
6
|
+
# which files need to be formatted or re-formatted.
|
|
7
|
+
#
|
|
8
|
+
# Example usage:
|
|
9
|
+
#
|
|
10
|
+
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
|
11
|
+
# makeblastdb.scan && makeblastdb.run
|
|
12
|
+
#
|
|
13
|
+
class MAKEBLASTDB
|
|
14
|
+
# We want V5 databases created using -parse_seqids for proper function of
# SequenceServer. This means each database should be comprised of at least
# the 9 files with the extensions listed below, keyed by the database's
# sequence type. Databases created by us will have two additional files with
# the extensions nhd and nhi, or phd and phi, due to the use of -hash_index
# option. Finally, multipart databases will have one additional file with
# the extension nal or pal. Those additional files are not listed here.
#
# Both the Hash and the extension lists are frozen so the table cannot be
# modified at runtime. Each list is kept in alphabetical order.
REQUIRED_EXTENSIONS = {
  'nucleotide' => %w[ndb nhr nin nog nos not nsq ntf nto].freeze,
  'protein' => %w[pdb phr pin pog pos pot psq ptf pto].freeze
}.freeze
|
|
24
|
+
|
|
25
|
+
extend Forwardable
|
|
26
|
+
|
|
27
|
+
def_delegators SequenceServer, :config, :sys
|
|
28
|
+
|
|
29
|
+
# Directory that is scanned for FASTA files and BLAST databases.
attr_reader :database_dir

# Remembers the directory to work on; nothing is scanned until `#scan`.
def initialize(database_dir)
  @database_dir = database_dir
end
|
|
34
|
+
|
|
35
|
+
# Walks the database directory and works out which FASTA files have to be
# formatted for the first time or formatted again.
#
# Returns `true` when at least one file was queued, `false` otherwise.
def scan
  # Both lists are rebuilt from scratch on every scan.
  @formatted_fastas = []
  @fastas_to_format = []

  # The list of already formatted FASTAs (as reported by blastdbcmd) has to
  # be known first: it is needed to tell unformatted files apart and to
  # decide which databases must be re-created.
  determine_formatted_fastas
  determine_unformatted_fastas
  determine_fastas_to_reformat

  @fastas_to_format.size > 0
end
|
|
54
|
+
|
|
55
|
+
# Runs makeblastdb on each file in `@fastas_to_format`.
#
# Does nothing and returns nil unless `#scan` has been run before (that is,
# while `@fastas_to_format` is still unset). Otherwise returns the list of
# queued entries after processing each of them in order.
def run
  # Checking for nil only: calling `empty?` on an unset list used to raise
  # NoMethodError, and iterating an empty list is already a no-op.
  return unless @fastas_to_format

  @fastas_to_format.each do |path, title, type|
    make_blast_database(path, title, type)
  end
end
|
|
63
|
+
|
|
64
|
+
private
|
|
65
|
+
|
|
66
|
+
# Determines which FASTA files in the database directory are already
# formatted. Adds `[path, title, type]` entries to @formatted_fastas, with
# the type downcased.
#
# Each line of `blastdbcmd` output is split on whitespace: the first field
# is taken as the path, the last field as the sequence type, and everything
# in between as the title, so titles containing spaces are kept intact.
# Lines with fewer than two fields (e.g. blank lines) are skipped.
#
# NOTE(review): a path containing whitespace still cannot be parsed
# correctly from space separated output.
def determine_formatted_fastas
  blastdbcmd.each_line do |line|
    fields = line.split(' ')
    next if fields.length < 2

    path = fields.first
    type = fields.last
    title = fields[1..-2].join(' ')
    next if multipart_database_name?(path)

    @formatted_fastas << [path, title, type.downcase]
  end
end
|
|
75
|
+
|
|
76
|
+
# Goes through the already formatted FASTAs and queues, in
# @fastas_to_format, every database that is missing at least one of the
# files listed in REQUIRED_EXTENSIONS for its sequence type.
def determine_fastas_to_reformat
  @formatted_fastas.each do |path, title, type|
    wanted = REQUIRED_EXTENSIONS[type]
    # Last dot-separated component of every file sitting next to the
    # database path.
    found = Dir["#{path}.*"].map { |file| file.split('.').last }
    complete = (found & wanted).sort == wanted

    @fastas_to_format << [path, title, type] unless complete
  end
end
|
|
87
|
+
|
|
88
|
+
# Recursively looks for files in the database directory that look like
# FASTA but are not among the already formatted ones, and queues them in
# @fastas_to_format together with a suggested title and the guessed
# sequence type.
#
# NOTE(review): the guessed type is queued as-is, including `false` when
# guess_sequence_type_in_fasta cannot settle on a single type - confirm
# that this is intended.
def determine_unformatted_fastas
  Find.find(database_dir) do |candidate|
    skip = File.directory?(candidate) ||
           !probably_fasta?(candidate) ||
           @formatted_fastas.any? { |entry| entry[0] == candidate }
    next if skip

    suggested_title = make_db_title(File.basename(candidate))
    guessed_type = guess_sequence_type_in_fasta(candidate)
    @fastas_to_format << [candidate, suggested_title, guessed_type]
  end
end
|
|
101
|
+
|
|
102
|
+
# Asks `blastdbcmd` to recursively list the BLAST databases found in the
# database directory, one per line, as path, title and sequence type
# separated by single spaces. Returns the first value handed back by `sys`
# for that command. Called by `determine_formatted_fastas`.
def blastdbcmd
  command = ['blastdbcmd', '-recursive', '-list', database_dir,
             '-list_outfmt "%f %t %p"'].join(' ')
  listing, = sys(command, path: config[:bin])
  listing
end
|
|
111
|
+
|
|
112
|
+
# Interactive front-end for creating one BLAST database: asks whether to
# proceed for the given FASTA file and sequence type, then lets the user
# confirm the title and enter a taxid before the database is built.
#
# Returns nil without doing anything further if the user declines.
def make_blast_database(file, title, type)
  if make_blast_database?(file, type)
    # Prompt order matters: title first, taxid second.
    confirmed_title = confirm_database_title(title)
    tax_id = fetch_tax_id
    _make_blast_database(file, type, confirmed_title, tax_id)
  end
end
|
|
119
|
+
|
|
120
|
+
# Runs `makeblastdb` (with -parse_seqids and -hash_index) for the given
# FASTA file and prints what the command wrote to stdout and stderr.
#
# file  - path of the FASTA file; if it does not exist, it is first
#         re-created from the BLAST database of the same name.
# type  - sequence type; only the first four characters of its string
#         form are passed on as -dbtype (e.g. "nucl", "prot").
# title - database title.
# taxid - taxonomy id passed to -taxid.
#
# The file path and the title are shell-escaped so that spaces, quotes
# and other special characters cannot break (or inject into) the command.
# This assumes `sys` hands the string to a POSIX shell - TODO confirm.
#
# If the command fails (CommandFailed), prints a diagnostic message and
# terminates the process immediately.
def _make_blast_database(file, type, title, taxid)
  extract_fasta(file) unless File.exist?(file)
  cmd = 'makeblastdb -parse_seqids -hash_index ' \
        "-in #{Shellwords.escape(file.to_s)} " \
        "-dbtype #{type.to_s.slice(0, 4)} " \
        "-title #{Shellwords.escape(title.to_s)} " \
        "-taxid #{taxid}"
  out, err = sys(cmd, path: config[:bin])
  puts out.strip
  puts err.strip
rescue CommandFailed => e
  puts <<~MSG
    Could not create BLAST database for: #{file}
    Tried: #{cmd}
    stdout: #{e.stdout}
    stderr: #{e.stderr}
  MSG
  exit!
end
|
|
137
|
+
|
|
138
|
+
# Prints the file path and the guessed sequence type, then asks the user
# whether to go ahead.
#
# Returns false if the answer contains the letter 'n' (in either case)
# anywhere, true for every other answer including an empty one.
def make_blast_database?(file, type)
  2.times { puts }
  puts "FASTA file to format/reformat: #{file}",
       "FASTA type: #{type}"
  print 'Proceed? [y/n] (Default: y): '
  answer = STDIN.gets.to_s.strip
  answer !~ /n/i
end
|
|
151
|
+
|
|
152
|
+
# Shows the title we are about to use and gives the user a chance to type
# a different one.
#
# Returns what the user typed (stripped) if it is not empty, and the
# suggested title otherwise.
def confirm_database_title(default)
  print "Enter a database title or will use '#{default}': "
  typed = STDIN.gets.to_s.strip
  return typed unless typed.empty?

  default || typed
end
|
|
161
|
+
|
|
162
|
+
# Get taxid from the user. Returns the number entered by the user, or 0 if
# nothing was entered (including when standard input has reached EOF).
#
# Using 0 as taxid is equivalent to not setting taxid for the database
# that will be created.
#
# Keeps asking while the input is not a valid integer. Only the
# conversion error is handled here; previously a bare rescue combined with
# `retry` also swallowed the NoMethodError raised on EOF and looped
# forever.
def fetch_tax_id
  default = 0
  loop do
    print 'Enter taxid (optional): '
    # `gets` returns nil on EOF; to_s turns that into an empty answer.
    user_response = STDIN.gets.to_s.strip
    return default if user_response.empty?

    begin
      return Integer(user_response)
    rescue ArgumentError
      puts 'taxid should be a number'
    end
  end
end
|
|
175
|
+
|
|
176
|
+
# Dumps all sequences of a BLAST database with `blastdbcmd -entry all`,
# passing the database path itself to `sys` as the `stdout:` option.
#
# Used while reformatting a BLAST database whose FASTA file no longer
# exists. If the command fails (CommandFailed), prints a diagnostic
# message and terminates the process immediately.
def extract_fasta(db)
  puts '', 'Extracting sequences ...'
  cmd = "blastdbcmd -entry all -db #{db}"
  sys_options = { stdout: db, path: config[:bin] }
  sys(cmd, **sys_options)
rescue CommandFailed => e
  puts <<~MSG
    Could not extract sequences from: #{db}
    Tried: #{cmd}
    stdout: #{e.stdout}
    stderr: #{e.stderr}
  MSG
  exit!
end
|
|
194
|
+
|
|
195
|
+
# Tells whether the given name looks like one volume of a multi-part
# database. The decision is delegated to Database.
def multipart_database_name?(db_name)
  Database.multipart_database_name?(db_name)
end
|
|
200
|
+
|
|
201
|
+
# A file is considered to be FASTA if its very first byte is '>'.
# Returns false for an empty file.
def probably_fasta?(file)
  first_byte = File.open(file) { |io| io.read(1) }
  first_byte == '>'
end
|
|
205
|
+
|
|
206
|
+
# Suggests improved titles when generating database names from files
# for improved appearance and readability in web interface.
# For example:
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
#
# db_name - file name to derive the title from. It is not modified, so
#           frozen strings are accepted.
#
# Returns a new String.
def make_db_title(db_name)
  title = db_name.tr('"', "'")
  # removes .fasta like extension names; only the trailing extension is
  # dropped, the same text occurring elsewhere in the name is kept
  extension = File.extname(title)
  title = title[0...-extension.length] unless extension.empty?
  # replaces _ with ' ',
  title = title.gsub(/(_)/, ' ')
  # replaces '.' with ' ' when no numbers are on either side,
  title = title.gsub(/(\D)\.(?=\D)/, '\1 ')
  # preserves version numbers
  title.gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
end
|
|
223
|
+
|
|
224
|
+
# Guess whether FASTA file contains protein or nucleotide sequences by
# sampling the first few characters of the file.
#
# Returns the type reported by Sequence.guess_type when all sampled
# sequences that yield a type agree on it, and false otherwise.
def guess_sequence_type_in_fasta(file)
  guesses = sample_sequences(file).map { |sample| Sequence.guess_type(sample) }
  distinct = guesses.uniq.compact
  distinct.length == 1 ? distinct.first : false
end
|
|
232
|
+
|
|
233
|
+
# Read up to the first 1,048,576 bytes of the file, split the read text on
# fasta def line pattern and return.
#
# If the given file is FASTA, returns Array of as many different
# sequences in the portion of the file read. Returns the portion
# of the file read wrapped in an Array otherwise. Returns an empty Array
# for an empty file.
def sample_sequences(file)
  # File.read with a length returns nil at EOF, i.e. for an empty file.
  sample = File.read(file, 1_048_576).to_s
  sample.split(/^>.+$/).reject(&:empty?)
end
|
|
242
|
+
end
|
|
243
|
+
end
|
|
@@ -69,10 +69,16 @@ module SequenceServer
|
|
|
69
69
|
# include available databases and user-defined search options.
|
|
70
70
|
get '/searchdata.json' do
|
|
71
71
|
searchdata = {
|
|
72
|
+
query: Database.retrieve(params[:query]),
|
|
72
73
|
database: Database.all,
|
|
73
|
-
options: SequenceServer.config[:options]
|
|
74
|
-
query: Database.retrieve(params[:query])
|
|
74
|
+
options: SequenceServer.config[:options]
|
|
75
75
|
}
|
|
76
|
+
|
|
77
|
+
# If a job_id is specified, update searchdata from job meta data (i.e.,
|
|
78
|
+
# query, pre-selected databases, advanced options used). Query is only
|
|
79
|
+
# updated if params[:query] is not specified.
|
|
80
|
+
update_searchdata_from_job(searchdata) if params[:job_id]
|
|
81
|
+
|
|
76
82
|
searchdata.to_json
|
|
77
83
|
end
|
|
78
84
|
|
|
@@ -180,5 +186,25 @@ module SequenceServer
|
|
|
180
186
|
|
|
181
187
|
error_data.to_json
|
|
182
188
|
end
|
|
189
|
+
|
|
190
|
+
# Fills `searchdata` with the query sequences, selected databases, and
# advanced params used by the job identified by params[:job_id].
#
# Nothing is changed for jobs created from an imported XML file.
def update_searchdata_from_job(searchdata)
  job = Job.fetch(params[:job_id])
  return if job.imported_xml_file

  # Only read job.qfile if we are not going to use Database.retrieve.
  searchdata[:query] = File.read(job.qfile) unless params[:query]

  # Which databases to pre-select.
  searchdata[:preSelectedDbs] = job.databases

  # job.advanced may be nil in case of old jobs. In this case, we do not
  # override searchdata so that default advanced parameters can be applied.
  # Note that, job.advanced will be an empty string if a user deletes the
  # default advanced parameters from the advanced params input field. In
  # this case, we do want the advanced params input field to be empty when
  # the user hits the back button. Thus we do not test for empty string.
  return unless job.advanced

  # searchdata[:options] may be the very Hash held in
  # SequenceServer.config[:options]. Build a new Hash instead of assigning
  # into it, so that one job's parameters do not become the defaults
  # served to every later request.
  searchdata[:options] =
    searchdata[:options].merge(job.method => [job.advanced])
end
|
|
183
209
|
end
|
|
184
210
|
end
|