sequenceserver 2.2.0 → 3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/COPYRIGHT.txt +1 -1
- data/bin/sequenceserver +4 -2
- data/lib/sequenceserver/blast/error.rb +53 -0
- data/lib/sequenceserver/blast/job.rb +2 -43
- data/lib/sequenceserver/job.rb +21 -11
- data/lib/sequenceserver/makeblastdb-modified-with-cache.rb +345 -0
- data/lib/sequenceserver/makeblastdb.rb +26 -12
- data/lib/sequenceserver/routes.rb +29 -3
- data/lib/sequenceserver/server.rb +1 -1
- data/lib/sequenceserver/version.rb +1 -1
- data/lib/sequenceserver.rb +3 -0
- data/public/404.html +27 -0
- data/public/config.js +0 -6
- data/public/css/grapher.css +1 -1
- data/public/css/sequenceserver.css +22 -11
- data/public/css/sequenceserver.min.css +2 -2
- data/public/js/circos.js +7 -3
- data/public/js/dnd.js +3 -3
- data/public/js/fastq_to_fasta.js +35 -0
- data/public/js/form.js +30 -11
- data/public/js/grapher.js +123 -113
- data/public/js/hit.js +8 -2
- data/public/js/hits_overview.js +4 -1
- data/public/js/jquery_world.js +0 -1
- data/public/js/kablammo.js +4 -0
- data/public/js/length_distribution.js +5 -1
- data/public/js/null_plugins/download_links.js +7 -0
- data/public/js/null_plugins/hit_buttons.js +11 -0
- data/public/js/null_plugins/report_plugins.js +18 -0
- data/public/js/query.js +26 -6
- data/public/js/report.js +33 -17
- data/public/js/search.js +0 -8
- data/public/js/sidebar.js +11 -1
- data/public/js/tests/mock_data/sequences.js +18 -1
- data/public/js/tests/search_query.spec.js +12 -3
- data/public/sequenceserver-report.min.js +76 -42
- data/public/sequenceserver-search.min.js +34 -33
- data/views/layout.erb +9 -12
- metadata +32 -23
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68dcb3dda53edae5d095423c164568484e786b927fcf733a4722eafde7c3a155
|
4
|
+
data.tar.gz: afb985fe5e762b8a1ab9e1dc4e8dd0bc6158f601a3cc95c0029f03a8587c1fa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb122a17858e6b7c567a418bda05358ae1a9b34ef1dfdcc11bb5f83097b47e427d7344e61e9f9102488f59f6448592b7d66e1e67ee213498e750dfea5e2e7c8f
|
7
|
+
data.tar.gz: b7b7504faf439acda18aafa0866e74f3645e431b4252bdb12ffd4896d7cf6ee065756db9e1bba027184794f306ce3b732c4911dce56d8b4a96e734dcf86c336c
|
data/COPYRIGHT.txt
CHANGED
@@ -3,7 +3,7 @@ SequenceServer is copyright Anurag Priyam, Ben J Woodcroft and Yannick Wurm,
|
|
3
3
|
version 3, the text of which can be found in LICENSE.txt.
|
4
4
|
|
5
5
|
Components of SequenceServer, including Sinatra, Ox, Slop, html5shiv,
|
6
|
-
Underscore, jQuery, jQuery UI
|
6
|
+
Underscore, jQuery, jQuery UI and Bootstrap, are licensed under
|
7
7
|
the MIT license. D3.js is licensed under BSD license. biojs-vis-sequence
|
8
8
|
is licensed under Apache license. Thin and JSON are licensed under Ruby
|
9
9
|
license. All unmodified files from these and other sources retain their
|
data/bin/sequenceserver
CHANGED
@@ -358,7 +358,9 @@ begin
|
|
358
358
|
end
|
359
359
|
|
360
360
|
if make_blast_databases?
|
361
|
-
if SequenceServer.makeblastdb.
|
361
|
+
if SequenceServer.makeblastdb.no_fastas?
|
362
|
+
puts "Couldn't find any FASTA files in #{SequenceServer.config[:database_dir]}."
|
363
|
+
elsif SequenceServer.makeblastdb.any_to_format_or_reformat?
|
362
364
|
puts
|
363
365
|
puts <<~MSG
|
364
366
|
SequenceServer has scanned your databases directory and will now offer
|
@@ -380,7 +382,7 @@ begin
|
|
380
382
|
print '>> '
|
381
383
|
response = STDIN.gets.to_s.strip
|
382
384
|
SequenceServer.makeblastdb.run unless response =~ /^[n]$/i
|
383
|
-
|
385
|
+
else
|
384
386
|
puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
|
385
387
|
'are formatted.'
|
386
388
|
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
|
2
|
+
|
3
|
+
module SequenceServer
|
4
|
+
module BLAST
|
5
|
+
class Error
|
6
|
+
attr_reader :exitstatus, :stdout, :stderr
|
7
|
+
|
8
|
+
def initialize(exitstatus:, stdout:, stderr:)
|
9
|
+
@exitstatus = exitstatus
|
10
|
+
@stdout = stdout
|
11
|
+
@stderr = stderr
|
12
|
+
end
|
13
|
+
|
14
|
+
def raise!
|
15
|
+
return true if exitstatus.zero? && !File.zero?(stdout)
|
16
|
+
|
17
|
+
case exitstatus
|
18
|
+
when 1..2
|
19
|
+
# 1: Error in query sequences or options.
|
20
|
+
# 2: Error in BLAST databases.
|
21
|
+
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
22
|
+
error = File.read(stderr) if error.empty?
|
23
|
+
fail InputError, "(#{exitstatus}) #{error}"
|
24
|
+
when 4
|
25
|
+
# Out of memory. User can retry with a shorter search, so raising
|
26
|
+
# InputError here instead of SystemError.
|
27
|
+
fail InputError, <<~MSG
|
28
|
+
Ran out of memory. Please try a smaller query, fewer and smaller
|
29
|
+
databases, or limiting the output by using advanced options.
|
30
|
+
MSG
|
31
|
+
when 6
|
32
|
+
# Error creating output files. It can't be a permission issue as that
|
33
|
+
# would have been caught while creating job directory. But we can run
|
34
|
+
# out of storage after creating the job directory and while running
|
35
|
+
# the job. This is a SystemError.
|
36
|
+
fail SystemError, 'Ran out of disk space.'
|
37
|
+
else
|
38
|
+
# I am not sure what exit code 3 means and we should not
|
39
|
+
# encounter exit code 5. The only other error that I know can happen
|
40
|
+
# but is not yet handled is when BLAST+ binaries break such as after
|
41
|
+
# macOS updates. So raise SystemError, include the exit status in the
|
42
|
+
# message, and say that the "most likely" reason is broken BLAST+
|
43
|
+
# binaries.
|
44
|
+
|
45
|
+
error = File.read(stderr)
|
46
|
+
error = 'Most likely there is a problem with the BLAST+ binaries.' if error.empty?
|
47
|
+
|
48
|
+
fail SystemError, "BLAST failed abruptly (exit status: #{exitstatus}). #{error}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'sequenceserver/job'
|
2
2
|
require 'sequenceserver/zip_file_generator'
|
3
|
+
require 'sequenceserver/blast/error'
|
3
4
|
|
4
5
|
module SequenceServer
|
5
6
|
# BLAST module.
|
@@ -56,46 +57,8 @@ module SequenceServer
|
|
56
57
|
" -query '#{qfile}' #{options}"
|
57
58
|
end
|
58
59
|
|
59
|
-
# Override Job#raise! to raise specific API errors based on exitstatus
|
60
|
-
# and using contents of stderr to provide context about the error.
|
61
|
-
#
|
62
60
|
def raise!
|
63
|
-
|
64
|
-
return true if exitstatus.zero? && !File.zero?(stdout)
|
65
|
-
|
66
|
-
# Handle error. See [1].
|
67
|
-
case exitstatus
|
68
|
-
when 1..2
|
69
|
-
# 1: Error in query sequences or options.
|
70
|
-
# 2: Error in BLAST databases.
|
71
|
-
error = IO.foreach(stderr).grep(ERROR_LINE).join
|
72
|
-
error = File.read(stderr) if error.empty?
|
73
|
-
fail InputError, "(#{exitstatus}) #{error}"
|
74
|
-
when 4
|
75
|
-
# Out of memory. User can retry with a shorter search, so raising
|
76
|
-
# InputError here instead of SystemError.
|
77
|
-
fail InputError, <<~MSG
|
78
|
-
Ran out of memory. Please try a smaller query, fewer and smaller
|
79
|
-
databases, or limiting the output by using advanced options.
|
80
|
-
MSG
|
81
|
-
when 6
|
82
|
-
# Error creating output files. It can't be a permission issue as that
|
83
|
-
# would have been caught while creating job directory. But we can run
|
84
|
-
# out of storage after creating the job directory and while running
|
85
|
-
# the job. This is a SystemError.
|
86
|
-
fail SystemError, 'Ran out of disk space.'
|
87
|
-
else
|
88
|
-
# I am not sure what the exit codes 3 means and we should not
|
89
|
-
# encounter exit code 5. The only other error that I know can happen
|
90
|
-
# but is not yet handled is when BLAST+ binaries break such as after
|
91
|
-
# macOS updates. So raise SystemError, include the exit status in the
|
92
|
-
# message, and say that that the "most likely" reason is broken BLAST+
|
93
|
-
# binaries.
|
94
|
-
fail SystemError, <<~MSG
|
95
|
-
BLAST failed abruptly (exit status: #{exitstatus}). Most likely there is a
|
96
|
-
problem with the BLAST+ binaries.
|
97
|
-
MSG
|
98
|
-
end
|
61
|
+
SequenceServer::BLAST::Error.new(exitstatus: exitstatus, stdout: stdout, stderr: stderr).raise!
|
99
62
|
end
|
100
63
|
|
101
64
|
# Use it with a block to get a self-cleaning temporary archive file
|
@@ -189,7 +152,3 @@ module SequenceServer
|
|
189
152
|
end
|
190
153
|
end
|
191
154
|
end
|
192
|
-
|
193
|
-
# References
|
194
|
-
# ----------
|
195
|
-
# [1]: http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
|
data/lib/sequenceserver/job.rb
CHANGED
@@ -28,16 +28,25 @@ module SequenceServer
|
|
28
28
|
enqueue(job)
|
29
29
|
end
|
30
30
|
|
31
|
+
def serializable_classes
|
32
|
+
[
|
33
|
+
Time,
|
34
|
+
Symbol,
|
35
|
+
SequenceServer::Job,
|
36
|
+
SequenceServer::BLAST::Job,
|
37
|
+
SequenceServer::Database
|
38
|
+
]
|
39
|
+
end
|
40
|
+
|
31
41
|
# Fetches job with the given id.
|
32
42
|
def fetch(id)
|
33
43
|
job_file = File.join(DOTDIR, id, 'job.yaml')
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
44
|
+
return nil unless File.exist?(job_file)
|
45
|
+
|
46
|
+
YAML.safe_load_file(
|
47
|
+
job_file,
|
48
|
+
permitted_classes: serializable_classes
|
49
|
+
)
|
41
50
|
end
|
42
51
|
|
43
52
|
# Deletes job with the given id.
|
@@ -75,8 +84,8 @@ module SequenceServer
|
|
75
84
|
# of job data will be held, yields (if block given) and saves the job.
|
76
85
|
#
|
77
86
|
# Subclasses should extend `initialize` as per requirement.
|
78
|
-
def initialize(
|
79
|
-
@id = SecureRandom.uuid
|
87
|
+
def initialize(params = {})
|
88
|
+
@id = params.fetch(:id, SecureRandom.uuid)
|
80
89
|
@submitted_at = Time.now
|
81
90
|
mkdir_p dir
|
82
91
|
yield if block_given?
|
@@ -85,7 +94,7 @@ module SequenceServer
|
|
85
94
|
raise SystemError, 'Not enough disk space to start a new job'
|
86
95
|
rescue Errno::EACCES
|
87
96
|
raise SystemError, "Permission denied to write to #{DOTDIR}"
|
88
|
-
rescue => e
|
97
|
+
rescue StandardError => e
|
89
98
|
rm_rf dir
|
90
99
|
raise e
|
91
100
|
end
|
@@ -117,7 +126,7 @@ module SequenceServer
|
|
117
126
|
# should be called on a completed job before attempting to use the results.
|
118
127
|
# Subclasses should provide their own implementation.
|
119
128
|
def raise!
|
120
|
-
|
129
|
+
fail if done? && exitstatus != 0
|
121
130
|
end
|
122
131
|
|
123
132
|
# Where will the stdout be written to during execution and read from later.
|
@@ -163,6 +172,7 @@ module SequenceServer
|
|
163
172
|
def fetch(key)
|
164
173
|
filename = File.join(dir, key)
|
165
174
|
fail unless File.exist? filename
|
175
|
+
|
166
176
|
filename
|
167
177
|
end
|
168
178
|
|
@@ -0,0 +1,345 @@
|
|
1
|
+
require 'find'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module SequenceServer
|
5
|
+
# Smart `makeblastdb` wrapper: recursively scans database directory determining
|
6
|
+
# which files need to be formatted or re-formatted.
|
7
|
+
#
|
8
|
+
# Example usage:
|
9
|
+
#
|
10
|
+
# makeblastdb = MAKEBLASTDB.new(database_dir)
|
11
|
+
# makeblastdb.scan && makeblastdb.run
|
12
|
+
#
|
13
|
+
class MAKEBLASTDB
|
14
|
+
extend Forwardable
|
15
|
+
|
16
|
+
def_delegators SequenceServer, :config, :sys, :logger
|
17
|
+
|
18
|
+
def initialize(database_dir)
|
19
|
+
@database_dir = database_dir
|
20
|
+
end
|
21
|
+
|
22
|
+
attr_reader :database_dir
|
23
|
+
attr_reader :formatted_fastas
|
24
|
+
attr_reader :fastas_to_format
|
25
|
+
attr_reader :fastas_to_reformat
|
26
|
+
|
27
|
+
# Scans the database directory to determine which FASTA files require
|
28
|
+
# formatting or re-formatting.
|
29
|
+
#
|
30
|
+
# Returns `true` if there are files to (re-)format, `false` otherwise.
|
31
|
+
def scan
|
32
|
+
# We need to know the list of formatted FASTAs as reported by blastdbcmd
|
33
|
+
# first. This is required to determine both unformatted FASTAs and those
|
34
|
+
# that require reformatting.
|
35
|
+
@formatted_fastas = []
|
36
|
+
determine_formatted_fastas
|
37
|
+
|
38
|
+
# Now determine FASTA files that are unformatted or require reformatting.
|
39
|
+
@fastas_to_format = []
|
40
|
+
determine_unformatted_fastas
|
41
|
+
@fastas_to_reformat = []
|
42
|
+
determine_fastas_to_reformat
|
43
|
+
|
44
|
+
# Return true if there are files to be (re-)formatted or false otherwise.
|
45
|
+
!@fastas_to_format.empty? || !@fastas_to_reformat.empty?
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns true if at least one database in database directory is formatted.
|
49
|
+
def any_formatted?
|
50
|
+
!@formatted_fastas.empty?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Returns true if there is at least one unformatted FASTA in the databases
|
54
|
+
# directory.
|
55
|
+
def any_unformatted?
|
56
|
+
!@fastas_to_format.empty?
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns true if the databases directory contains one or more incompatible
|
60
|
+
# databases.
|
61
|
+
#
|
62
|
+
# Note that it is okay to only use V4 databases or only V5 databases.
|
63
|
+
# Incompatibility arises when they are mixed.
|
64
|
+
def any_incompatible?
|
65
|
+
return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
|
66
|
+
return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
|
67
|
+
true
|
68
|
+
end
|
69
|
+
|
70
|
+
# Runs makeblastdb on each file in `@fastas_to_format` and
|
71
|
+
# `@fastas_to_reformat`. Will do nothing unless `#scan`
|
72
|
+
# has been run before.
|
73
|
+
def run
|
74
|
+
format
|
75
|
+
reformat
|
76
|
+
end
|
77
|
+
|
78
|
+
# Format any unformatted FASTA files in database directory. Returns Array
|
79
|
+
# of files that were formatted.
|
80
|
+
def format
|
81
|
+
# Make the intent clear as well as ensure the program won't crash if we
|
82
|
+
# accidentally call format before calling scan.
|
83
|
+
return unless @fastas_to_format
|
84
|
+
@fastas_to_format.select do |path, title, type|
|
85
|
+
make_blast_database('format', path, title, type)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Re-format databases that require reformatting. Returns Array of files
|
90
|
+
# that were reformatted.
|
91
|
+
def reformat
|
92
|
+
# Make the intent clear as well as ensure the program won't crash if
|
93
|
+
# we accidentally call reformat before calling scan.
|
94
|
+
return unless @fastas_to_reformat
|
95
|
+
@fastas_to_reformat.select do |path, title, type, non_parse_seqids|
|
96
|
+
make_blast_database('reformat', path, title, type, non_parse_seqids)
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Determines which FASTA files in the database directory are already
|
103
|
+
# formatted. Adds to @formatted_fastas.
|
104
|
+
def determine_formatted_fastas
|
105
|
+
blastdbcmd.each_line do |line|
|
106
|
+
path, *rest = line.chomp.split("\t")
|
107
|
+
next if multipart_database_name?(path)
|
108
|
+
rest << get_categories(path)
|
109
|
+
@formatted_fastas << Database.new(path, *rest)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# Determines which FASTA files in the database directory require
|
114
|
+
# reformatting. Adds to @fastas_to_reformat.
|
115
|
+
def determine_fastas_to_reformat
|
116
|
+
@formatted_fastas.each do |ff|
|
117
|
+
if ff.v4? || ff.non_parse_seqids?
|
118
|
+
@fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
# Determines which FASTA files in the database directory are
|
124
|
+
# unformatted. Adds to @fastas_to_format.
|
125
|
+
def determine_unformatted_fastas
|
126
|
+
# Add a trailing slash to database_dir - Find.find doesn't work as
|
127
|
+
# expected without the trailing slash if database_dir is a symlink
|
128
|
+
# inside a docker container.
|
129
|
+
Find.find(database_dir + '/') do |path|
|
130
|
+
next if File.directory?(path)
|
131
|
+
next unless probably_fasta?(path)
|
132
|
+
next if @formatted_fastas.any? { |f| f[0] == path }
|
133
|
+
|
134
|
+
@fastas_to_format << [path,
|
135
|
+
make_db_title(path),
|
136
|
+
guess_sequence_type_in_fasta(path)]
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|
141
|
+
# directory. Returns the output of `blastdbcmd`. This method is called
|
142
|
+
# by `determine_formatted_fastas`.
|
143
|
+
def blastdbcmd
|
144
|
+
# calculate checksum of database directory
|
145
|
+
current_db_checksum = Zlib::crc32(Dir.glob(File.join(config[:database_dir], '/**/*')).map {
|
146
|
+
|path| path.to_s + "_" + File.mtime(path).to_s + "_" + File.size(path).to_s
|
147
|
+
}.to_s)
|
148
|
+
|
149
|
+
checksum_path = config[:database_dir].chomp('/') + '.checksum'
|
150
|
+
index_path = config[:database_dir].chomp('/') + '.index'
|
151
|
+
|
152
|
+
if File.exists?(checksum_path)
|
153
|
+
if current_db_checksum == File.read(checksum_path).to_i # db directory hasn't changed
|
154
|
+
if File.exists?(index_path) # lets use existing index
|
155
|
+
logger.info "Using existing database index: #{index_path}"
|
156
|
+
return File.read(index_path)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end rescue logger.error "Could not read: #{checksum_path} or #{index_path}"
|
160
|
+
|
161
|
+
# database directory has changed, or index file doesn't exist
|
162
|
+
# thus we run blastdbcmd to get formatted FASTA files
|
163
|
+
logger.info "Scanning for BLAST databases & creating index"
|
164
|
+
cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
|
165
|
+
' -list_outfmt "%f %t %p %n %l %d %v"'
|
166
|
+
out, err = sys(cmd, path: config[:bin])
|
167
|
+
errpat = /BLAST Database error/
|
168
|
+
fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
|
169
|
+
|
170
|
+
# write checksum and index to file
|
171
|
+
File.open(checksum_path, 'w') { |f| f.write(current_db_checksum) } rescue
|
172
|
+
logger.error "Could not write database checksum to file" + checksum_path
|
173
|
+
File.open(index_path, 'w') { |f| f.write(out) } rescue
|
174
|
+
logger.error "Could not write database index to file" + index_path
|
175
|
+
|
176
|
+
return out
|
177
|
+
rescue CommandFailed => e
|
178
|
+
fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
|
179
|
+
end
|
180
|
+
|
181
|
+
# Create BLAST database, given FASTA file and sequence type in FASTA file.
|
182
|
+
def make_blast_database(action, file, title, type, non_parse_seqids = false)
|
183
|
+
return unless make_blast_database?(action, file, type)
|
184
|
+
title = confirm_database_title(title)
|
185
|
+
extract_fasta(file) unless File.exist?(file)
|
186
|
+
taxonomy = taxid_map(file, non_parse_seqids) || taxid
|
187
|
+
_make_blast_database(file, type, title, taxonomy)
|
188
|
+
end
|
189
|
+
|
190
|
+
# Show file path and guessed sequence type to the user and obtain a y/n
|
191
|
+
# response.
|
192
|
+
#
|
193
|
+
# Returns true if the user entered anything but 'n' or 'N'.
|
194
|
+
def make_blast_database?(action, file, type)
|
195
|
+
puts
|
196
|
+
puts
|
197
|
+
puts "FASTA file to #{action}: #{file}"
|
198
|
+
puts "FASTA type: #{type}"
|
199
|
+
print 'Proceed? [y/n] (Default: y): '
|
200
|
+
response = STDIN.gets.to_s.strip
|
201
|
+
!response.match(/n/i)
|
202
|
+
end
|
203
|
+
|
204
|
+
# Show the database title that we are going to use to the user for
|
205
|
+
# confirmation.
|
206
|
+
#
|
207
|
+
# Returns user input if any. Auto-determined title otherwise.
|
208
|
+
def confirm_database_title(default)
|
209
|
+
print "Enter a database title or will use '#{default}': "
|
210
|
+
from_user = STDIN.gets.to_s.strip
|
211
|
+
from_user.empty? && default || from_user
|
212
|
+
end
|
213
|
+
|
214
|
+
# Check if a '.taxid_map.txt' file exists. If not, try getting it
|
215
|
+
# using blastdbcmd.
|
216
|
+
def taxid_map(db, non_parse_seqids)
|
217
|
+
return if non_parse_seqids
|
218
|
+
taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
|
219
|
+
extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
|
220
|
+
"-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
|
221
|
+
end
|
222
|
+
|
223
|
+
# Get taxid from the user. Returns user input or 0.
|
224
|
+
#
|
225
|
+
# Using 0 as taxid is equivalent to not setting taxid for the database
|
226
|
+
# that will be created.
|
227
|
+
def taxid
|
228
|
+
default = 0
|
229
|
+
print 'Enter taxid (optional): '
|
230
|
+
user_response = STDIN.gets.strip
|
231
|
+
"-taxid #{user_response.empty? && default || Integer(user_response)}"
|
232
|
+
rescue ArgumentError # presumably from call to Integer()
|
233
|
+
puts 'taxid should be a number'
|
234
|
+
retry
|
235
|
+
end
|
236
|
+
|
237
|
+
def _make_blast_database(file, type, title, taxonomy)
|
238
|
+
cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
|
239
|
+
" -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
|
240
|
+
" #{taxonomy}"
|
241
|
+
out, err = sys(cmd, path: config[:bin])
|
242
|
+
puts out.strip
|
243
|
+
puts err.strip
|
244
|
+
return true
|
245
|
+
rescue CommandFailed => e
|
246
|
+
puts <<~MSG
|
247
|
+
Could not create BLAST database for: #{file}
|
248
|
+
Tried: #{cmd}
|
249
|
+
stdout: #{e.stdout}
|
250
|
+
stderr: #{e.stderr}
|
251
|
+
MSG
|
252
|
+
exit!
|
253
|
+
end
|
254
|
+
|
255
|
+
# Extract FASTA file from BLAST database.
|
256
|
+
#
|
257
|
+
# Invoked while reformatting a BLAST database if the corresponding
|
258
|
+
# FASTA file does not exist.
|
259
|
+
def extract_fasta(db)
|
260
|
+
puts
|
261
|
+
puts 'Extracting sequences ...'
|
262
|
+
cmd = "blastdbcmd -entry all -db #{db}"
|
263
|
+
sys(cmd, stdout: db, path: config[:bin])
|
264
|
+
rescue CommandFailed => e
|
265
|
+
puts <<~MSG
|
266
|
+
Could not extract sequences from: #{db}
|
267
|
+
Tried: #{cmd}
|
268
|
+
stdout: #{e.stdout}
|
269
|
+
stderr: #{e.stderr}
|
270
|
+
MSG
|
271
|
+
exit!
|
272
|
+
end
|
273
|
+
|
274
|
+
def extract_taxid_map(db, taxmap_file)
|
275
|
+
cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
|
276
|
+
sys(cmd, stdout: taxmap_file, path: config[:bin])
|
277
|
+
rescue CommandFailed => e
|
278
|
+
# silence error
|
279
|
+
end
|
280
|
+
|
281
|
+
# Returns true if the database name appears to be a multi-part database
|
282
|
+
# name.
|
283
|
+
#
|
284
|
+
# e.g.
|
285
|
+
# /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
|
286
|
+
# /home/ben/pd.ben/sequenceserver/db/nr => no
|
287
|
+
# /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
|
288
|
+
# /home/ben/pd.ben/sequenceserver/db/nr00 => no
|
289
|
+
# /mnt/blast-db/refseq_genomic.100 => yes
|
290
|
+
def multipart_database_name?(db_name)
|
291
|
+
!(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
|
292
|
+
end
|
293
|
+
|
294
|
+
def get_categories(path)
|
295
|
+
path
|
296
|
+
.gsub(config[:database_dir], '') # remove database_dir from path
|
297
|
+
.split('/')
|
298
|
+
.reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
|
299
|
+
end
|
300
|
+
|
301
|
+
# Returns true if the file name ends with a FASTA-like suffix and the first character of the file is '>'.
|
302
|
+
def probably_fasta?(file)
|
303
|
+
return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)
|
304
|
+
File.read(file, 1) == '>'
|
305
|
+
end
|
306
|
+
|
307
|
+
# Suggests improved titles when generating database names from files
|
308
|
+
# for improved appearance and readability in web interface.
|
309
|
+
# For example:
|
310
|
+
# Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
|
311
|
+
# S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
|
312
|
+
def make_db_title(path)
|
313
|
+
db_name = File.basename(path)
|
314
|
+
db_name.tr!('"', "'")
|
315
|
+
# removes .fasta like extension names
|
316
|
+
db_name.gsub!(File.extname(db_name), '')
|
317
|
+
# replaces _ with ' ',
|
318
|
+
db_name.gsub!(/(_)/, ' ')
|
319
|
+
# replaces '.' with ' ' when no numbers are on either side,
|
320
|
+
db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
|
321
|
+
# preserves version numbers
|
322
|
+
db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
|
323
|
+
db_name
|
324
|
+
end
|
325
|
+
|
326
|
+
# Guess whether FASTA file contains protein or nucleotide sequences by
|
327
|
+
# sampling a few characters of the file.
|
328
|
+
def guess_sequence_type_in_fasta(file)
|
329
|
+
sequences = sample_sequences(file)
|
330
|
+
sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
|
331
|
+
sequence_types = sequence_types.uniq.compact
|
332
|
+
(sequence_types.length == 1) && sequence_types.first
|
333
|
+
end
|
334
|
+
|
335
|
+
# Read first 1,048,576 characters of the file, split the read text on
|
336
|
+
# fasta def line pattern and return.
|
337
|
+
#
|
338
|
+
# If the given file is FASTA, returns Array of as many different
|
339
|
+
# sequences in the portion of the file read. Returns the portion
|
340
|
+
# of the file read wrapped in an Array otherwise.
|
341
|
+
def sample_sequences(file)
|
342
|
+
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
|
343
|
+
end
|
344
|
+
end
|
345
|
+
end
|
@@ -30,6 +30,10 @@ module SequenceServer
|
|
30
30
|
any_to_format? || any_to_reformat?
|
31
31
|
end
|
32
32
|
|
33
|
+
def no_fastas?
|
34
|
+
probably_fastas.empty?
|
35
|
+
end
|
36
|
+
|
33
37
|
# Runs makeblastdb on each file in `fastas_to_format` and
|
34
38
|
# `fastas_to_reformat`.
|
35
39
|
def run
|
@@ -79,12 +83,12 @@ module SequenceServer
|
|
79
83
|
@formatted_fastas
|
80
84
|
end
|
81
85
|
|
82
|
-
private
|
83
|
-
|
84
86
|
def any_to_format?
|
85
87
|
fastas_to_format.any?
|
86
88
|
end
|
87
89
|
|
90
|
+
private
|
91
|
+
|
88
92
|
def any_to_reformat?
|
89
93
|
fastas_to_reformat.any?
|
90
94
|
end
|
@@ -107,22 +111,32 @@ module SequenceServer
|
|
107
111
|
def fastas_to_format
|
108
112
|
return @fastas_to_format if defined?(@fastas_to_format)
|
109
113
|
|
110
|
-
|
114
|
+
formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
|
115
|
+
fasta_paths_to_format = probably_fastas - formatted_fasta_paths
|
116
|
+
|
117
|
+
@fastas_to_format = fasta_paths_to_format.map do |path|
|
118
|
+
[
|
119
|
+
path,
|
120
|
+
make_db_title(path),
|
121
|
+
guess_sequence_type_in_fasta(path)
|
122
|
+
]
|
123
|
+
end
|
124
|
+
|
125
|
+
@fastas_to_format
|
126
|
+
end
|
127
|
+
|
128
|
+
def probably_fastas
|
129
|
+
return @probably_fastas if defined?(@probably_fastas)
|
130
|
+
|
131
|
+
@probably_fastas = []
|
111
132
|
|
112
|
-
# Add a trailing slash to database_dir - Find.find doesn't work as
|
113
|
-
# expected without the trailing slash if database_dir is a symlink
|
114
|
-
# inside a docker container.
|
115
133
|
Find.find(database_dir + '/') do |path|
|
116
134
|
next if File.directory?(path)
|
117
|
-
next unless probably_fasta?(path)
|
118
|
-
next if formatted_fastas.any? { |f| f[0] == path }
|
119
135
|
|
120
|
-
@
|
121
|
-
make_db_title(path),
|
122
|
-
guess_sequence_type_in_fasta(path)]
|
136
|
+
@probably_fastas << path if probably_fasta?(path)
|
123
137
|
end
|
124
138
|
|
125
|
-
@
|
139
|
+
@probably_fastas
|
126
140
|
end
|
127
141
|
|
128
142
|
# Runs `blastdbcmd` to determine formatted FASTA files in the database
|