sequenceserver 2.2.0 → 3.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sequenceserver might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/COPYRIGHT.txt +1 -1
- data/bin/sequenceserver +4 -2
- data/lib/sequenceserver/blast/error.rb +53 -0
- data/lib/sequenceserver/blast/job.rb +2 -43
- data/lib/sequenceserver/job.rb +21 -11
- data/lib/sequenceserver/makeblastdb-modified-with-cache.rb +345 -0
- data/lib/sequenceserver/makeblastdb.rb +26 -12
- data/lib/sequenceserver/routes.rb +29 -3
- data/lib/sequenceserver/server.rb +1 -1
- data/lib/sequenceserver/version.rb +1 -1
- data/lib/sequenceserver.rb +3 -0
- data/public/404.html +27 -0
- data/public/config.js +0 -6
- data/public/css/grapher.css +1 -1
- data/public/css/sequenceserver.css +22 -11
- data/public/css/sequenceserver.min.css +2 -2
- data/public/js/circos.js +7 -3
- data/public/js/dnd.js +3 -3
- data/public/js/fastq_to_fasta.js +35 -0
- data/public/js/form.js +30 -11
- data/public/js/grapher.js +123 -113
- data/public/js/hit.js +8 -2
- data/public/js/hits_overview.js +4 -1
- data/public/js/jquery_world.js +0 -1
- data/public/js/kablammo.js +4 -0
- data/public/js/length_distribution.js +5 -1
- data/public/js/null_plugins/download_links.js +7 -0
- data/public/js/null_plugins/hit_buttons.js +11 -0
- data/public/js/null_plugins/report_plugins.js +18 -0
- data/public/js/query.js +26 -6
- data/public/js/report.js +33 -17
- data/public/js/search.js +0 -8
- data/public/js/sidebar.js +11 -1
- data/public/js/tests/mock_data/sequences.js +18 -1
- data/public/js/tests/search_query.spec.js +12 -3
- data/public/sequenceserver-report.min.js +76 -42
- data/public/sequenceserver-search.min.js +34 -33
- data/views/layout.erb +9 -12
- metadata +32 -23
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68dcb3dda53edae5d095423c164568484e786b927fcf733a4722eafde7c3a155
|
4
|
+
data.tar.gz: afb985fe5e762b8a1ab9e1dc4e8dd0bc6158f601a3cc95c0029f03a8587c1fa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb122a17858e6b7c567a418bda05358ae1a9b34ef1dfdcc11bb5f83097b47e427d7344e61e9f9102488f59f6448592b7d66e1e67ee213498e750dfea5e2e7c8f
|
7
|
+
data.tar.gz: b7b7504faf439acda18aafa0866e74f3645e431b4252bdb12ffd4896d7cf6ee065756db9e1bba027184794f306ce3b732c4911dce56d8b4a96e734dcf86c336c
|
data/COPYRIGHT.txt
CHANGED
@@ -3,7 +3,7 @@ SequenceServer is copyright Anurag Priyam, Ben J Woodcroft and Yannick Wurm,
|
|
3
3
|
version 3, the text of which can be found in LICENSE.txt.
|
4
4
|
|
5
5
|
Components of SequenceServer, including Sinatra, Ox, Slop, html5shiv,
|
6
|
-
Underscore, jQuery, jQuery UI
|
6
|
+
Underscore, jQuery, jQuery UI and Bootstrap, are licensed under
|
7
7
|
the MIT license. D3.js is licensed under BSD license. biojs-vis-sequence
|
8
8
|
is licensed under Apache license. Thin and JSON are licensed under Ruby
|
9
9
|
license. All unmodified files from these and other sources retain their
|
data/bin/sequenceserver
CHANGED
@@ -358,7 +358,9 @@ begin
|
|
358
358
|
end
|
359
359
|
|
360
360
|
if make_blast_databases?
|
361
|
-
if SequenceServer.makeblastdb.
|
361
|
+
if SequenceServer.makeblastdb.no_fastas?
|
362
|
+
puts "Couldn't find any FASTA files in #{SequenceServer.config[:database_dir]}."
|
363
|
+
elsif SequenceServer.makeblastdb.any_to_format_or_reformat?
|
362
364
|
puts
|
363
365
|
puts <<~MSG
|
364
366
|
SequenceServer has scanned your databases directory and will now offer
|
@@ -380,7 +382,7 @@ begin
|
|
380
382
|
print '>> '
|
381
383
|
response = STDIN.gets.to_s.strip
|
382
384
|
SequenceServer.makeblastdb.run unless response =~ /^[n]$/i
|
383
|
-
|
385
|
+
else
|
384
386
|
puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
|
385
387
|
'are formatted.'
|
386
388
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
|
2
|
+
|
3
|
+
module SequenceServer
|
4
|
+
module BLAST
|
5
|
+
class Error
|
6
|
+
attr_reader :exitstatus, :stdout, :stderr
|
7
|
+
|
8
|
+
def initialize(exitstatus:, stdout:, stderr:)
|
9
|
+
@exitstatus = exitstatus
|
10
|
+
@stdout = stdout
|
11
|
+
@stderr = stderr
|
12
|
+
end
|
13
|
+
|
14
|
+
def raise!
|
15
|
+
return true if exitstatus.zero? && !File.zero?(stdout)
|
16
|
+
|
17
|
+
case exitstatus
|
18
|
+
when 1..2
|
19
|
+
# 1: Error in query sequences or options.
|
20
|
+
# 2: Error in BLAST databases.
|
21
|
+
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
22
|
+
error = File.read(stderr) if error.empty?
|
23
|
+
fail InputError, "(#{exitstatus}) #{error}"
|
24
|
+
when 4
|
25
|
+
# Out of memory. User can retry with a shorter search, so raising
|
26
|
+
# InputError here instead of SystemError.
|
27
|
+
fail InputError, <<~MSG
|
28
|
+
Ran out of memory. Please try a smaller query, fewer and smaller
|
29
|
+
databases, or limiting the output by using advanced options.
|
30
|
+
MSG
|
31
|
+
when 6
|
32
|
+
# Error creating output files. It can't be a permission issue as that
|
33
|
+
# would have been caught while creating job directory. But we can run
|
34
|
+
# out of storage after creating the job directory and while running
|
35
|
+
# the job. This is a SystemError.
|
36
|
+
fail SystemError, 'Ran out of disk space.'
|
37
|
+
else
|
38
|
+
# I am not sure what exit code 3 means and we should not
|
39
|
+
# encounter exit code 5. The only other error that I know can happen
|
40
|
+
# but is not yet handled is when BLAST+ binaries break such as after
|
41
|
+
# macOS updates. So raise SystemError, include the exit status in the
|
42
|
+
# message, and say that the "most likely" reason is broken BLAST+
|
43
|
+
# binaries.
|
44
|
+
|
45
|
+
error = File.read(stderr)
|
46
|
+
error = 'Most likely there is a problem with the BLAST+ binaries.' if error.empty?
|
47
|
+
|
48
|
+
fail SystemError, "BLAST failed abruptly (exit status: #{exitstatus}). #{error}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'sequenceserver/job'
|
2
2
|
require 'sequenceserver/zip_file_generator'
|
3
|
+
require 'sequenceserver/blast/error'
|
3
4
|
|
4
5
|
module SequenceServer
|
5
6
|
# BLAST module.
|
@@ -56,46 +57,8 @@ module SequenceServer
|
|
56
57
|
" -query '#{qfile}' #{options}"
|
57
58
|
end
|
58
59
|
|
59
|
-
# Override Job#raise! to raise specific API errors based on exitstatus
|
60
|
-
# and using contents of stderr to provide context about the error.
|
61
|
-
#
|
62
60
|
def raise!
|
63
|
-
|
64
|
-
return true if exitstatus.zero? && !File.zero?(stdout)
|
65
|
-
|
66
|
-
# Handle error. See [1].
|
67
|
-
case exitstatus
|
68
|
-
when 1..2
|
69
|
-
# 1: Error in query sequences or options.
|
70
|
-
# 2: Error in BLAST databases.
|
71
|
-
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
72
|
-
error = File.read(stderr) if error.empty?
|
73
|
-
fail InputError, "(#{exitstatus}) #{error}"
|
74
|
-
when 4
|
75
|
-
# Out of memory. User can retry with a shorter search, so raising
|
76
|
-
# InputError here instead of SystemError.
|
77
|
-
fail InputError, <<~MSG
|
78
|
-
Ran out of memory. Please try a smaller query, fewer and smaller
|
79
|
-
databases, or limiting the output by using advanced options.
|
80
|
-
MSG
|
81
|
-
when 6
|
82
|
-
# Error creating output files. It can't be a permission issue as that
|
83
|
-
# would have been caught while creating job directory. But we can run
|
84
|
-
# out of storage after creating the job directory and while running
|
85
|
-
# the job. This is a SystemError.
|
86
|
-
fail SystemError, 'Ran out of disk space.'
|
87
|
-
else
|
88
|
-
# I am not sure what the exit codes 3 means and we should not
|
89
|
-
# encounter exit code 5. The only other error that I know can happen
|
90
|
-
# but is not yet handled is when BLAST+ binaries break such as after
|
91
|
-
# macOS updates. So raise SystemError, include the exit status in the
|
92
|
-
# message, and say that that the "most likely" reason is broken BLAST+
|
93
|
-
# binaries.
|
94
|
-
fail SystemError, <<~MSG
|
95
|
-
BLAST failed abruptly (exit status: #{exitstatus}). Most likely there is a
|
96
|
-
problem with the BLAST+ binaries.
|
97
|
-
MSG
|
98
|
-
end
|
61
|
+
SequenceServer::BLAST::Error.new(exitstatus: exitstatus, stdout: stdout, stderr: stderr).raise!
|
99
62
|
end
|
100
63
|
|
101
64
|
# Use it with a block to get a self-cleaning temporary archive file
|
@@ -189,7 +152,3 @@ module SequenceServer
|
|
189
152
|
end
|
190
153
|
end
|
191
154
|
end
|
192
|
-
|
193
|
-
# References
|
194
|
-
# ----------
|
195
|
-
# [1]: http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
|
data/lib/sequenceserver/job.rb
CHANGED
@@ -28,16 +28,25 @@ module SequenceServer
|
|
28
28
|
enqueue(job)
|
29
29
|
end
|
30
30
|
|
31
|
+
def serializable_classes
|
32
|
+
[
|
33
|
+
Time,
|
34
|
+
Symbol,
|
35
|
+
SequenceServer::Job,
|
36
|
+
SequenceServer::BLAST::Job,
|
37
|
+
SequenceServer::Database
|
38
|
+
]
|
39
|
+
end
|
40
|
+
|
31
41
|
# Fetches job with the given id.
|
32
42
|
def fetch(id)
|
33
43
|
job_file = File.join(DOTDIR, id, 'job.yaml')
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
44
|
+
return nil unless File.exist?(job_file)
|
45
|
+
|
46
|
+
YAML.safe_load_file(
|
47
|
+
job_file,
|
48
|
+
permitted_classes: serializable_classes
|
49
|
+
)
|
41
50
|
end
|
42
51
|
|
43
52
|
# Deletes job with the given id.
|
@@ -75,8 +84,8 @@ module SequenceServer
|
|
75
84
|
# of job data will be held, yields (if block given) and saves the job.
|
76
85
|
#
|
77
86
|
# Subclasses should extend `initialize` as per requirement.
|
78
|
-
def initialize(
|
79
|
-
@id = SecureRandom.uuid
|
87
|
+
def initialize(params = {})
|
88
|
+
@id = params.fetch(:id, SecureRandom.uuid)
|
80
89
|
@submitted_at = Time.now
|
81
90
|
mkdir_p dir
|
82
91
|
yield if block_given?
|
@@ -85,7 +94,7 @@ module SequenceServer
|
|
85
94
|
raise SystemError, 'Not enough disk space to start a new job'
|
86
95
|
rescue Errno::EACCES
|
87
96
|
raise SystemError, "Permission denied to write to #{DOTDIR}"
|
88
|
-
rescue => e
|
97
|
+
rescue StandardError => e
|
89
98
|
rm_rf dir
|
90
99
|
raise e
|
91
100
|
end
|
@@ -117,7 +126,7 @@ module SequenceServer
|
|
117
126
|
# should be called on a completed job before attempting to use the results.
|
118
127
|
# Subclasses should provide their own implementation.
|
119
128
|
def raise!
|
120
|
-
|
129
|
+
fail if done? && exitstatus != 0
|
121
130
|
end
|
122
131
|
|
123
132
|
# Where will the stdout be written to during execution and read from later.
|
@@ -163,6 +172,7 @@ module SequenceServer
|
|
163
172
|
def fetch(key)
|
164
173
|
filename = File.join(dir, key)
|
165
174
|
fail unless File.exist? filename
|
175
|
+
|
166
176
|
filename
|
167
177
|
end
|
168
178
|
|
@@ -0,0 +1,345 @@
|
|
1
|
+
require 'find'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module SequenceServer
|
5
|
+
# Smart `makeblastdb` wrapper: recursively scans database directory determining
|
6
|
+
# which files need to be formatted or re-formatted.
|
7
|
+
#
|
8
|
+
# Example usage:
|
9
|
+
#
|
10
|
+
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
11
|
+
# makeblastdb.scan && makeblastdb.run
|
12
|
+
#
|
13
|
+
class MAKEBLASTDB
|
14
|
+
extend Forwardable
|
15
|
+
|
16
|
+
def_delegators SequenceServer, :config, :sys, :logger
|
17
|
+
|
18
|
+
def initialize(database_dir)
|
19
|
+
@database_dir = database_dir
|
20
|
+
end
|
21
|
+
|
22
|
+
attr_reader :database_dir
|
23
|
+
attr_reader :formatted_fastas
|
24
|
+
attr_reader :fastas_to_format
|
25
|
+
attr_reader :fastas_to_reformat
|
26
|
+
|
27
|
+
# Scans the database directory to determine which FASTA files require
|
28
|
+
# formatting or re-formatting.
|
29
|
+
#
|
30
|
+
# Returns `true` if there are files to (re-)format, `false` otherwise.
|
31
|
+
def scan
|
32
|
+
# We need to know the list of formatted FASTAs as reported by blastdbcmd
|
33
|
+
# first. This is required to determine both unformatted FASTAs and those
|
34
|
+
# that require reformatting.
|
35
|
+
@formatted_fastas = []
|
36
|
+
determine_formatted_fastas
|
37
|
+
|
38
|
+
# Now determine FASTA files that are unformatted or require reformatting.
|
39
|
+
@fastas_to_format = []
|
40
|
+
determine_unformatted_fastas
|
41
|
+
@fastas_to_reformat = []
|
42
|
+
determine_fastas_to_reformat
|
43
|
+
|
44
|
+
# Return true if there are files to be (re-)formatted or false otherwise.
|
45
|
+
!@fastas_to_format.empty? || !@fastas_to_reformat.empty?
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns true if at least one database in database directory is formatted.
|
49
|
+
def any_formatted?
|
50
|
+
!@formatted_fastas.empty?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Returns true if there is at least one unformatted FASTA in the databases
|
54
|
+
# directory.
|
55
|
+
def any_unformatted?
|
56
|
+
!@fastas_to_format.empty?
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns true if the databases directory contains one or more incompatible
|
60
|
+
# databases.
|
61
|
+
#
|
62
|
+
# Note that it is okay to only use V4 databases or only V5 databases.
|
63
|
+
# Incompatibility arises when they are mixed.
|
64
|
+
def any_incompatible?
|
65
|
+
return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
|
66
|
+
return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
|
67
|
+
true
|
68
|
+
end
|
69
|
+
|
70
|
+
# Runs makeblastdb on each file in `@fastas_to_format` and
|
71
|
+
# `@fastas_to_reformat`. Will do nothing unless `#scan`
|
72
|
+
# has been run before.
|
73
|
+
def run
|
74
|
+
format
|
75
|
+
reformat
|
76
|
+
end
|
77
|
+
|
78
|
+
# Format any unformatted FASTA files in database directory. Returns Array
|
79
|
+
# of files that were formatted.
|
80
|
+
def format
|
81
|
+
# Make the intent clear as well as ensure the program won't crash if we
|
82
|
+
# accidentally call format before calling scan.
|
83
|
+
return unless @fastas_to_format
|
84
|
+
@fastas_to_format.select do |path, title, type|
|
85
|
+
make_blast_database('format', path, title, type)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Re-format databases that require reformatting. Returns Array of files
|
90
|
+
# that were reformatted.
|
91
|
+
def reformat
|
92
|
+
# Make the intent clear as well as ensure the program won't crash if
|
93
|
+
# we accidentally call reformat before calling scan.
|
94
|
+
return unless @fastas_to_reformat
|
95
|
+
@fastas_to_reformat.select do |path, title, type, non_parse_seqids|
|
96
|
+
make_blast_database('reformat', path, title, type, non_parse_seqids)
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Determines which FASTA files in the database directory are already
|
103
|
+
# formatted. Adds to @formatted_fastas.
|
104
|
+
def determine_formatted_fastas
|
105
|
+
blastdbcmd.each_line do |line|
|
106
|
+
path, *rest = line.chomp.split("\t")
|
107
|
+
next if multipart_database_name?(path)
|
108
|
+
rest << get_categories(path)
|
109
|
+
@formatted_fastas << Database.new(path, *rest)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# Determines which FASTA files in the database directory require
|
114
|
+
# reformatting. Adds to @fastas_to_reformat.
|
115
|
+
def determine_fastas_to_reformat
|
116
|
+
@formatted_fastas.each do |ff|
|
117
|
+
if ff.v4? || ff.non_parse_seqids?
|
118
|
+
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
# Determines which FASTA files in the database directory are
|
124
|
+
# unformatted. Adds to @fastas_to_format.
|
125
|
+
def determine_unformatted_fastas
|
126
|
+
# Add a trailing slash to database_dir - Find.find doesn't work as
|
127
|
+
# expected without the trailing slash if database_dir is a symlink
|
128
|
+
# inside a docker container.
|
129
|
+
Find.find(database_dir + '/') do |path|
|
130
|
+
next if File.directory?(path)
|
131
|
+
next unless probably_fasta?(path)
|
132
|
+
next if @formatted_fastas.any? { |f| f[0] == path }
|
133
|
+
|
134
|
+
@fastas_to_format << [path,
|
135
|
+
make_db_title(path),
|
136
|
+
guess_sequence_type_in_fasta(path)]
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|
141
|
+
# directory. Returns the output of `blastdbcmd`. This method is called
|
142
|
+
# by `determine_formatted_fastas`.
|
143
|
+
def blastdbcmd
|
144
|
+
# calculate checksum of database directory
|
145
|
+
current_db_checksum = Zlib::crc32(Dir.glob(File.join(config[:database_dir], '/**/*')).map {
|
146
|
+
|path| path.to_s + "_" + File.mtime(path).to_s + "_" + File.size(path).to_s
|
147
|
+
}.to_s)
|
148
|
+
|
149
|
+
checksum_path = config[:database_dir].chomp('/') + '.checksum'
|
150
|
+
index_path = config[:database_dir].chomp('/') + '.index'
|
151
|
+
|
152
|
+
if File.exists?(checksum_path)
|
153
|
+
if current_db_checksum == File.read(checksum_path).to_i # db directory hasn't changed
|
154
|
+
if File.exists?(index_path) # lets use existing index
|
155
|
+
logger.info "Using existing database index: #{index_path}"
|
156
|
+
return File.read(index_path)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end rescue logger.error "Could not read: #{checksum_path} or #{index_path}"
|
160
|
+
|
161
|
+
# database directory has changed, or index file doesn't exist
|
162
|
+
# thus we run blastdbcmd to get formatted FASTA files
|
163
|
+
logger.info "Scanning for BLAST databases & creating index"
|
164
|
+
cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
|
165
|
+
' -list_outfmt "%f %t %p %n %l %d %v"'
|
166
|
+
out, err = sys(cmd, path: config[:bin])
|
167
|
+
errpat = /BLAST Database error/
|
168
|
+
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
|
169
|
+
|
170
|
+
# write checksum and index to file
|
171
|
+
File.open(checksum_path, 'w') { |f| f.write(current_db_checksum) } rescue
|
172
|
+
logger.error "Could not write database checksum to file" + checksum_path
|
173
|
+
File.open(index_path, 'w') { |f| f.write(out) } rescue
|
174
|
+
logger.error "Could not write database index to file" + index_path
|
175
|
+
|
176
|
+
return out
|
177
|
+
rescue CommandFailed => e
|
178
|
+
fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
|
179
|
+
end
|
180
|
+
|
181
|
+
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
182
|
+
def make_blast_database(action, file, title, type, non_parse_seqids = false)
|
183
|
+
return unless make_blast_database?(action, file, type)
|
184
|
+
title = confirm_database_title(title)
|
185
|
+
extract_fasta(file) unless File.exist?(file)
|
186
|
+
taxonomy = taxid_map(file, non_parse_seqids) || taxid
|
187
|
+
_make_blast_database(file, type, title, taxonomy)
|
188
|
+
end
|
189
|
+
|
190
|
+
# Show file path and guessed sequence type to the user and obtain a y/n
|
191
|
+
# response.
|
192
|
+
#
|
193
|
+
# Returns true unless the user's response contains 'n' or 'N'.
|
194
|
+
def make_blast_database?(action, file, type)
|
195
|
+
puts
|
196
|
+
puts
|
197
|
+
puts "FASTA file to #{action}: #{file}"
|
198
|
+
puts "FASTA type: #{type}"
|
199
|
+
print 'Proceed? [y/n] (Default: y): '
|
200
|
+
response = STDIN.gets.to_s.strip
|
201
|
+
!response.match(/n/i)
|
202
|
+
end
|
203
|
+
|
204
|
+
# Show the database title that we are going to use to the user for
|
205
|
+
# confirmation.
|
206
|
+
#
|
207
|
+
# Returns user input if any. Auto-determined title otherwise.
|
208
|
+
def confirm_database_title(default)
|
209
|
+
print "Enter a database title or will use '#{default}': "
|
210
|
+
from_user = STDIN.gets.to_s.strip
|
211
|
+
from_user.empty? && default || from_user
|
212
|
+
end
|
213
|
+
|
214
|
+
# Check if a '.taxid_map.txt' file exists. If not, try getting it
|
215
|
+
# using blastdbcmd.
|
216
|
+
def taxid_map(db, non_parse_seqids)
|
217
|
+
return if non_parse_seqids
|
218
|
+
taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
|
219
|
+
extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
|
220
|
+
"-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get taxid from the user. Returns a `-taxid` option with user input or 0.
|
224
|
+
#
|
225
|
+
# Using 0 as taxid is equivalent to not setting taxid for the database
|
226
|
+
# that will be created.
|
227
|
+
def taxid
|
228
|
+
default = 0
|
229
|
+
print 'Enter taxid (optional): '
|
230
|
+
user_response = STDIN.gets.strip
|
231
|
+
"-taxid #{user_response.empty? && default || Integer(user_response)}"
|
232
|
+
rescue ArgumentError # presumably from call to Integer()
|
233
|
+
puts 'taxid should be a number'
|
234
|
+
retry
|
235
|
+
end
|
236
|
+
|
237
|
+
def _make_blast_database(file, type, title, taxonomy)
|
238
|
+
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
|
239
|
+
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
240
|
+
" #{taxonomy}"
|
241
|
+
out, err = sys(cmd, path: config[:bin])
|
242
|
+
puts out.strip
|
243
|
+
puts err.strip
|
244
|
+
return true
|
245
|
+
rescue CommandFailed => e
|
246
|
+
puts <<~MSG
|
247
|
+
Could not create BLAST database for: #{file}
|
248
|
+
Tried: #{cmd}
|
249
|
+
stdout: #{e.stdout}
|
250
|
+
stderr: #{e.stderr}
|
251
|
+
MSG
|
252
|
+
exit!
|
253
|
+
end
|
254
|
+
|
255
|
+
# Extract FASTA file from BLAST database.
|
256
|
+
#
|
257
|
+
# Invoked while reformatting a BLAST database if the corresponding
|
258
|
+
# FASTA file does not exist.
|
259
|
+
def extract_fasta(db)
|
260
|
+
puts
|
261
|
+
puts 'Extracting sequences ...'
|
262
|
+
cmd = "blastdbcmd -entry all -db #{db}"
|
263
|
+
sys(cmd, stdout: db, path: config[:bin])
|
264
|
+
rescue CommandFailed => e
|
265
|
+
puts <<~MSG
|
266
|
+
Could not extract sequences from: #{db}
|
267
|
+
Tried: #{cmd}
|
268
|
+
stdout: #{e.stdout}
|
269
|
+
stderr: #{e.stderr}
|
270
|
+
MSG
|
271
|
+
exit!
|
272
|
+
end
|
273
|
+
|
274
|
+
def extract_taxid_map(db, taxmap_file)
|
275
|
+
cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
|
276
|
+
sys(cmd, stdout: taxmap_file, path: config[:bin])
|
277
|
+
rescue CommandFailed => e
|
278
|
+
# silence error
|
279
|
+
end
|
280
|
+
|
281
|
+
# Returns true if the database name appears to be a multi-part database
|
282
|
+
# name.
|
283
|
+
#
|
284
|
+
# e.g.
|
285
|
+
# /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
|
286
|
+
# /home/ben/pd.ben/sequenceserver/db/nr => no
|
287
|
+
# /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
|
288
|
+
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
|
289
|
+
# /mnt/blast-db/refseq_genomic.100 => yes
|
290
|
+
def multipart_database_name?(db_name)
|
291
|
+
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
292
|
+
end
|
293
|
+
|
294
|
+
def get_categories(path)
|
295
|
+
path
|
296
|
+
.gsub(config[:database_dir], '') # remove database_dir from path
|
297
|
+
.split('/')
|
298
|
+
.reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
|
299
|
+
end
|
300
|
+
|
301
|
+
# Returns true if the file name has a FASTA-like suffix and the first character of the file is '>'.
|
302
|
+
def probably_fasta?(file)
|
303
|
+
return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)
|
304
|
+
File.read(file, 1) == '>'
|
305
|
+
end
|
306
|
+
|
307
|
+
# Suggests improved titles when generating database names from files
|
308
|
+
# for improved appearance and readability in web interface.
|
309
|
+
# For example:
|
310
|
+
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
311
|
+
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
312
|
+
def make_db_title(path)
|
313
|
+
db_name = File.basename(path)
|
314
|
+
db_name.tr!('"', "'")
|
315
|
+
# removes .fasta like extension names
|
316
|
+
db_name.gsub!(File.extname(db_name), '')
|
317
|
+
# replaces _ with ' ',
|
318
|
+
db_name.gsub!(/(_)/, ' ')
|
319
|
+
# replaces '.' with ' ' when no numbers are on either side,
|
320
|
+
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
321
|
+
# preserves version numbers
|
322
|
+
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
323
|
+
db_name
|
324
|
+
end
|
325
|
+
|
326
|
+
# Guess whether FASTA file contains protein or nucleotide sequences by
|
327
|
+
# sampling a few characters of the file.
|
328
|
+
def guess_sequence_type_in_fasta(file)
|
329
|
+
sequences = sample_sequences(file)
|
330
|
+
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
331
|
+
sequence_types = sequence_types.uniq.compact
|
332
|
+
(sequence_types.length == 1) && sequence_types.first
|
333
|
+
end
|
334
|
+
|
335
|
+
# Read first 1,048,576 characters of the file, split the read text on
|
336
|
+
# fasta def line pattern and return.
|
337
|
+
#
|
338
|
+
# If the given file is FASTA, returns Array of as many different
|
339
|
+
# sequences in the portion of the file read. Returns the portion
|
340
|
+
# of the file read wrapped in an Array otherwise.
|
341
|
+
def sample_sequences(file)
|
342
|
+
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
@@ -30,6 +30,10 @@ module SequenceServer
|
|
30
30
|
any_to_format? || any_to_reformat?
|
31
31
|
end
|
32
32
|
|
33
|
+
def no_fastas?
|
34
|
+
probably_fastas.empty?
|
35
|
+
end
|
36
|
+
|
33
37
|
# Runs makeblastdb on each file in `fastas_to_format` and
|
34
38
|
# `fastas_to_reformat`.
|
35
39
|
def run
|
@@ -79,12 +83,12 @@ module SequenceServer
|
|
79
83
|
@formatted_fastas
|
80
84
|
end
|
81
85
|
|
82
|
-
private
|
83
|
-
|
84
86
|
def any_to_format?
|
85
87
|
fastas_to_format.any?
|
86
88
|
end
|
87
89
|
|
90
|
+
private
|
91
|
+
|
88
92
|
def any_to_reformat?
|
89
93
|
fastas_to_reformat.any?
|
90
94
|
end
|
@@ -107,22 +111,32 @@ module SequenceServer
|
|
107
111
|
def fastas_to_format
|
108
112
|
return @fastas_to_format if defined?(@fastas_to_format)
|
109
113
|
|
110
|
-
|
114
|
+
formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
|
115
|
+
fasta_paths_to_format = probably_fastas - formatted_fasta_paths
|
116
|
+
|
117
|
+
@fastas_to_format = fasta_paths_to_format.map do |path|
|
118
|
+
[
|
119
|
+
path,
|
120
|
+
make_db_title(path),
|
121
|
+
guess_sequence_type_in_fasta(path)
|
122
|
+
]
|
123
|
+
end
|
124
|
+
|
125
|
+
@fastas_to_format
|
126
|
+
end
|
127
|
+
|
128
|
+
def probably_fastas
|
129
|
+
return @probably_fastas if defined?(@probably_fastas)
|
130
|
+
|
131
|
+
@probably_fastas = []
|
111
132
|
|
112
|
-
# Add a trailing slash to database_dir - Find.find doesn't work as
|
113
|
-
# expected without the trailing slash if database_dir is a symlink
|
114
|
-
# inside a docker container.
|
115
133
|
Find.find(database_dir + '/') do |path|
|
116
134
|
next if File.directory?(path)
|
117
|
-
next unless probably_fasta?(path)
|
118
|
-
next if formatted_fastas.any? { |f| f[0] == path }
|
119
135
|
|
120
|
-
@
|
121
|
-
make_db_title(path),
|
122
|
-
guess_sequence_type_in_fasta(path)]
|
136
|
+
@probably_fastas << path if probably_fasta?(path)
|
123
137
|
end
|
124
138
|
|
125
|
-
@
|
139
|
+
@probably_fastas
|
126
140
|
end
|
127
141
|
|
128
142
|
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|