sequenceserver 2.2.0 → 3.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sequenceserver might be problematic. Click here for more details.

Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/COPYRIGHT.txt +1 -1
  3. data/bin/sequenceserver +4 -2
  4. data/lib/sequenceserver/blast/error.rb +53 -0
  5. data/lib/sequenceserver/blast/job.rb +2 -43
  6. data/lib/sequenceserver/job.rb +21 -11
  7. data/lib/sequenceserver/makeblastdb-modified-with-cache.rb +345 -0
  8. data/lib/sequenceserver/makeblastdb.rb +26 -12
  9. data/lib/sequenceserver/routes.rb +29 -3
  10. data/lib/sequenceserver/server.rb +1 -1
  11. data/lib/sequenceserver/version.rb +1 -1
  12. data/lib/sequenceserver.rb +3 -0
  13. data/public/404.html +27 -0
  14. data/public/config.js +0 -6
  15. data/public/css/grapher.css +1 -1
  16. data/public/css/sequenceserver.css +22 -11
  17. data/public/css/sequenceserver.min.css +2 -2
  18. data/public/js/circos.js +7 -3
  19. data/public/js/dnd.js +3 -3
  20. data/public/js/fastq_to_fasta.js +35 -0
  21. data/public/js/form.js +30 -11
  22. data/public/js/grapher.js +123 -113
  23. data/public/js/hit.js +8 -2
  24. data/public/js/hits_overview.js +4 -1
  25. data/public/js/jquery_world.js +0 -1
  26. data/public/js/kablammo.js +4 -0
  27. data/public/js/length_distribution.js +5 -1
  28. data/public/js/null_plugins/download_links.js +7 -0
  29. data/public/js/null_plugins/hit_buttons.js +11 -0
  30. data/public/js/null_plugins/report_plugins.js +18 -0
  31. data/public/js/query.js +26 -6
  32. data/public/js/report.js +33 -17
  33. data/public/js/search.js +0 -8
  34. data/public/js/sidebar.js +11 -1
  35. data/public/js/tests/mock_data/sequences.js +18 -1
  36. data/public/js/tests/search_query.spec.js +12 -3
  37. data/public/sequenceserver-report.min.js +76 -42
  38. data/public/sequenceserver-search.min.js +34 -33
  39. data/views/layout.erb +9 -12
  40. metadata +32 -23
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 61b15f95cdc065739de70f2ba94d3022e6beec789dd9c64199ee1be3a2459c63
4
- data.tar.gz: d6c108bd39cb8c6a832787d2cbaa52b2c5a3653c0b08b289cb149b7221a9870c
3
+ metadata.gz: 68dcb3dda53edae5d095423c164568484e786b927fcf733a4722eafde7c3a155
4
+ data.tar.gz: afb985fe5e762b8a1ab9e1dc4e8dd0bc6158f601a3cc95c0029f03a8587c1fa2
5
5
  SHA512:
6
- metadata.gz: d4c2b1785980dd4e46a24a8a5a6d36652fb23f5389f9b7320cd9360f081cbad82b6dc6bfe735b5321738611094052631af0de635b77676587d703d466bd5f604
7
- data.tar.gz: 235e35e8ba4331894a7ac9f107d950462d3b3d892ddafc6f58d3ba48fbe6a03cfc016ac64798921472e296a0cb5577245aca49cbb517d897094a501ec4483beb
6
+ metadata.gz: fb122a17858e6b7c567a418bda05358ae1a9b34ef1dfdcc11bb5f83097b47e427d7344e61e9f9102488f59f6448592b7d66e1e67ee213498e750dfea5e2e7c8f
7
+ data.tar.gz: b7b7504faf439acda18aafa0866e74f3645e431b4252bdb12ffd4896d7cf6ee065756db9e1bba027184794f306ce3b732c4911dce56d8b4a96e734dcf86c336c
data/COPYRIGHT.txt CHANGED
@@ -3,7 +3,7 @@ SequenceServer is copyright Anurag Priyam, Ben J Woodcroft and Yannick Wurm,
3
3
  version 3, the text of which can be found in LICENSE.txt.
4
4
 
5
5
  Components of SequenceServer, including Sinatra, Ox, Slop, html5shiv,
6
- Underscore, jQuery, jQuery UI, Bootstrap and Webshim, are licensed under
6
+ Underscore, jQuery, jQuery UI and Bootstrap, are licensed under
7
7
  the MIT license. D3.js is licensed under BSD license. biojs-vis-sequence
8
8
  is licensed under Apache license. Thin and JSON are licensed under Ruby
9
9
  license. All unmodified files from these and other sources retain their
data/bin/sequenceserver CHANGED
@@ -358,7 +358,9 @@ begin
358
358
  end
359
359
 
360
360
  if make_blast_databases?
361
- if SequenceServer.makeblastdb.any_to_format_or_reformat?
361
+ if SequenceServer.makeblastdb.no_fastas?
362
+ puts "Couldn't find any FASTA files in #{SequenceServer.config[:database_dir]}."
363
+ elsif SequenceServer.makeblastdb.any_to_format_or_reformat?
362
364
  puts
363
365
  puts <<~MSG
364
366
  SequenceServer has scanned your databases directory and will now offer
@@ -380,7 +382,7 @@ begin
380
382
  print '>> '
381
383
  response = STDIN.gets.to_s.strip
382
384
  SequenceServer.makeblastdb.run unless response =~ /^[n]$/i
383
- else
385
+ else
384
386
  puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
385
387
  'are formatted.'
386
388
  end
@@ -0,0 +1,53 @@
1
+ # http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
2
+
3
+ module SequenceServer
4
+ module BLAST
5
+ class Error
6
+ attr_reader :exitstatus, :stdout, :stderr
7
+
8
+ def initialize(exitstatus:, stdout:, stderr:)
9
+ @exitstatus = exitstatus
10
+ @stdout = stdout
11
+ @stderr = stderr
12
+ end
13
+
14
+ def raise!
15
+ return true if exitstatus.zero? && !File.zero?(stdout)
16
+
17
+ case exitstatus
18
+ when 1..2
19
+ # 1: Error in query sequences or options.
20
+ # 2: Error in BLAST databases.
21
+ error = IO.foreach(stderr).grep(ERROR_LINE).join
22
+ error = File.read(stderr) if error.empty?
23
+ fail InputError, "(#{exitstatus}) #{error}"
24
+ when 4
25
+ # Out of memory. User can retry with a shorter search, so raising
26
+ # InputError here instead of SystemError.
27
+ fail InputError, <<~MSG
28
+ Ran out of memory. Please try a smaller query, fewer and smaller
29
+ databases, or limiting the output by using advanced options.
30
+ MSG
31
+ when 6
32
+ # Error creating output files. It can't be a permission issue as that
33
+ # would have been caught while creating job directory. But we can run
34
+ # out of storage after creating the job directory and while running
35
+ # the job. This is a SystemError.
36
+ fail SystemError, 'Ran out of disk space.'
37
+ else
38
+ # I am not sure what exit code 3 means and we should not
39
+ # encounter exit code 5. The only other error that I know can happen
40
+ # but is not yet handled is when BLAST+ binaries break such as after
41
+ # macOS updates. So raise SystemError, include the exit status in the
42
+ # message, and say that that the "most likely" reason is broken BLAST+
43
+ # binaries.
44
+
45
+ error = File.read(stderr)
46
+ error = 'Most likely there is a problem with the BLAST+ binaries.' if error.empty?
47
+
48
+ fail SystemError, "BLAST failed abruptly (exit status: #{exitstatus}). #{error}"
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -1,5 +1,6 @@
1
1
  require 'sequenceserver/job'
2
2
  require 'sequenceserver/zip_file_generator'
3
+ require 'sequenceserver/blast/error'
3
4
 
4
5
  module SequenceServer
5
6
  # BLAST module.
@@ -56,46 +57,8 @@ module SequenceServer
56
57
  " -query '#{qfile}' #{options}"
57
58
  end
58
59
 
59
- # Override Job#raise! to raise specific API errors based on exitstatus
60
- # and using contents of stderr to provide context about the error.
61
- #
62
60
  def raise!
63
- # Return true exit status is 0 and stdout is not empty.
64
- return true if exitstatus.zero? && !File.zero?(stdout)
65
-
66
- # Handle error. See [1].
67
- case exitstatus
68
- when 1..2
69
- # 1: Error in query sequences or options.
70
- # 2: Error in BLAST databases.
71
- error = IO.foreach(stderr).grep(ERROR_LINE).join
72
- error = File.read(stderr) if error.empty?
73
- fail InputError, "(#{exitstatus}) #{error}"
74
- when 4
75
- # Out of memory. User can retry with a shorter search, so raising
76
- # InputError here instead of SystemError.
77
- fail InputError, <<~MSG
78
- Ran out of memory. Please try a smaller query, fewer and smaller
79
- databases, or limiting the output by using advanced options.
80
- MSG
81
- when 6
82
- # Error creating output files. It can't be a permission issue as that
83
- # would have been caught while creating job directory. But we can run
84
- # out of storage after creating the job directory and while running
85
- # the job. This is a SystemError.
86
- fail SystemError, 'Ran out of disk space.'
87
- else
88
- # I am not sure what the exit codes 3 means and we should not
89
- # encounter exit code 5. The only other error that I know can happen
90
- # but is not yet handled is when BLAST+ binaries break such as after
91
- # macOS updates. So raise SystemError, include the exit status in the
92
- # message, and say that that the "most likely" reason is broken BLAST+
93
- # binaries.
94
- fail SystemError, <<~MSG
95
- BLAST failed abruptly (exit status: #{exitstatus}). Most likely there is a
96
- problem with the BLAST+ binaries.
97
- MSG
98
- end
61
+ SequenceServer::BLAST::Error.new(exitstatus: exitstatus, stdout: stdout, stderr: stderr).raise!
99
62
  end
100
63
 
101
64
  # Use it with a block to get a self-cleaning temporary archive file
@@ -189,7 +152,3 @@ module SequenceServer
189
152
  end
190
153
  end
191
154
  end
192
-
193
- # References
194
- # ----------
195
- # [1]: http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
@@ -28,16 +28,25 @@ module SequenceServer
28
28
  enqueue(job)
29
29
  end
30
30
 
31
+ def serializable_classes
32
+ [
33
+ Time,
34
+ Symbol,
35
+ SequenceServer::Job,
36
+ SequenceServer::BLAST::Job,
37
+ SequenceServer::Database
38
+ ]
39
+ end
40
+
31
41
  # Fetches job with the given id.
32
42
  def fetch(id)
33
43
  job_file = File.join(DOTDIR, id, 'job.yaml')
34
- fail NotFound unless File.exist?(job_file)
35
- if RUBY_VERSION < '3.1.0'
36
- YAML.load_file(job_file)
37
- else
38
- YAML.load_file(job_file, permitted_classes: [Time, Symbol, SequenceServer::BLAST::Job, \
39
- SequenceServer::Database, SequenceServer::Job])
40
- end
44
+ return nil unless File.exist?(job_file)
45
+
46
+ YAML.safe_load_file(
47
+ job_file,
48
+ permitted_classes: serializable_classes
49
+ )
41
50
  end
42
51
 
43
52
  # Deletes job with the given id.
@@ -75,8 +84,8 @@ module SequenceServer
75
84
  # of job data will be held, yields (if block given) and saves the job.
76
85
  #
77
86
  # Subclasses should extend `initialize` as per requirement.
78
- def initialize(*)
79
- @id = SecureRandom.uuid
87
+ def initialize(params = {})
88
+ @id = params.fetch(:id, SecureRandom.uuid)
80
89
  @submitted_at = Time.now
81
90
  mkdir_p dir
82
91
  yield if block_given?
@@ -85,7 +94,7 @@ module SequenceServer
85
94
  raise SystemError, 'Not enough disk space to start a new job'
86
95
  rescue Errno::EACCES
87
96
  raise SystemError, "Permission denied to write to #{DOTDIR}"
88
- rescue => e
97
+ rescue StandardError => e
89
98
  rm_rf dir
90
99
  raise e
91
100
  end
@@ -117,7 +126,7 @@ module SequenceServer
117
126
  # should be called on a completed job before attempting to use the results.
118
127
  # Subclasses should provide their own implementation.
119
128
  def raise!
120
- raise if done? && exitstatus != 0
129
+ fail if done? && exitstatus != 0
121
130
  end
122
131
 
123
132
  # Where will the stdout be written to during execution and read from later.
@@ -163,6 +172,7 @@ module SequenceServer
163
172
  def fetch(key)
164
173
  filename = File.join(dir, key)
165
174
  fail unless File.exist? filename
175
+
166
176
  filename
167
177
  end
168
178
 
@@ -0,0 +1,345 @@
1
+ require 'find'
2
+ require 'forwardable'
3
+
4
+ module SequenceServer
5
+ # Smart `makeblastdb` wrapper: recursively scans database directory determining
6
+ # which files need to be formatted or re-formatted.
7
+ #
8
+ # Example usage:
9
+ #
10
+ # makeblastdb = MAKEBLASTDB.new(database_dir)
11
+ # makeblastdb.scan && makeblastdb.run
12
+ #
13
+ class MAKEBLASTDB
14
+ extend Forwardable
15
+
16
+ def_delegators SequenceServer, :config, :sys, :logger
17
+
18
+ def initialize(database_dir)
19
+ @database_dir = database_dir
20
+ end
21
+
22
+ attr_reader :database_dir
23
+ attr_reader :formatted_fastas
24
+ attr_reader :fastas_to_format
25
+ attr_reader :fastas_to_reformat
26
+
27
+ # Scans the database directory to determine which FASTA files require
28
+ # formatting or re-formatting.
29
+ #
30
+ # Returns `true` if there are files to (re-)format, `false` otherwise.
31
+ def scan
32
+ # We need to know the list of formatted FASTAs as reported by blastdbcmd
33
+ # first. This is required to determine both unformatted FASTAs and those
34
+ # that require reformatting.
35
+ @formatted_fastas = []
36
+ determine_formatted_fastas
37
+
38
+ # Now determine FASTA files that are unformatted or require reformatting.
39
+ @fastas_to_format = []
40
+ determine_unformatted_fastas
41
+ @fastas_to_reformat = []
42
+ determine_fastas_to_reformat
43
+
44
+ # Return true if there are files to be (re-)formatted or false otherwise.
45
+ !@fastas_to_format.empty? || !@fastas_to_reformat.empty?
46
+ end
47
+
48
+ # Returns true if at least one database in database directory is formatted.
49
+ def any_formatted?
50
+ !@formatted_fastas.empty?
51
+ end
52
+
53
+ # Returns true if there is at least one unformatted FASTA in the databases
54
+ # directory.
55
+ def any_unformatted?
56
+ !@fastas_to_format.empty?
57
+ end
58
+
59
+ # Returns true if the databases directory contains one or more incompatible
60
+ # databases.
61
+ #
62
+ # Note that it is okay to only use V4 databases or only V5 databases.
63
+ # Incompatibility arises when they are mixed.
64
+ def any_incompatible?
65
+ return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
66
+ return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
67
+ true
68
+ end
69
+
70
+ # Runs makeblastdb on each file in `@fastas_to_format` and
71
+ # `@fastas_to_reformat`. Will do nothing unless `#scan`
72
+ # has been run before.
73
+ def run
74
+ format
75
+ reformat
76
+ end
77
+
78
+ # Format any unformatted FASTA files in database directory. Returns Array
79
+ # of files that were formatted.
80
+ def format
81
+ # Make the intent clear as well as ensure the program won't crash if we
82
+ # accidentally call format before calling scan.
83
+ return unless @fastas_to_format
84
+ @fastas_to_format.select do |path, title, type|
85
+ make_blast_database('format', path, title, type)
86
+ end
87
+ end
88
+
89
+ # Re-format databases that require reformatting. Returns Array of files
90
+ # that were reformatted.
91
+ def reformat
92
+ # Make the intent clear as well as ensure the program won't crash if
93
+ # we accidentally call reformat before calling scan.
94
+ return unless @fastas_to_reformat
95
+ @fastas_to_reformat.select do |path, title, type, non_parse_seqids|
96
+ make_blast_database('reformat', path, title, type, non_parse_seqids)
97
+ end
98
+ end
99
+
100
+ private
101
+
102
+ # Determines which FASTA files in the database directory are already
103
+ # formatted. Adds to @formatted_fastas.
104
+ def determine_formatted_fastas
105
+ blastdbcmd.each_line do |line|
106
+ path, *rest = line.chomp.split("\t")
107
+ next if multipart_database_name?(path)
108
+ rest << get_categories(path)
109
+ @formatted_fastas << Database.new(path, *rest)
110
+ end
111
+ end
112
+
113
+ # Determines which FASTA files in the database directory require
114
+ # reformatting. Adds to @fastas_to_reformat.
115
+ def determine_fastas_to_reformat
116
+ @formatted_fastas.each do |ff|
117
+ if ff.v4? || ff.non_parse_seqids?
118
+ @fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
119
+ end
120
+ end
121
+ end
122
+
123
+ # Determines which FASTA files in the database directory are
124
+ # unformatted. Adds to @fastas_to_format.
125
+ def determine_unformatted_fastas
126
+ # Add a trailing slash to database_dir - Find.find doesn't work as
127
+ # expected without the trailing slash if database_dir is a symlink
128
+ # inside a docker container.
129
+ Find.find(database_dir + '/') do |path|
130
+ next if File.directory?(path)
131
+ next unless probably_fasta?(path)
132
+ next if @formatted_fastas.any? { |f| f[0] == path }
133
+
134
+ @fastas_to_format << [path,
135
+ make_db_title(path),
136
+ guess_sequence_type_in_fasta(path)]
137
+ end
138
+ end
139
+
140
+ # Runs `blastdbcmd` to determine formatted FASTA files in the database
141
+ # directory. Returns the output of `blastdbcmd`. This method is called
142
+ # by `determine_formatted_fastas`.
143
+ def blastdbcmd
144
+ # calculate checksum of database directory
145
+ current_db_checksum = Zlib::crc32(Dir.glob(File.join(config[:database_dir], '/**/*')).map {
146
+ |path| path.to_s + "_" + File.mtime(path).to_s + "_" + File.size(path).to_s
147
+ }.to_s)
148
+
149
+ checksum_path = config[:database_dir].chomp('/') + '.checksum'
150
+ index_path = config[:database_dir].chomp('/') + '.index'
151
+
152
+ if File.exists?(checksum_path)
153
+ if current_db_checksum == File.read(checksum_path).to_i # db directory hasn't changed
154
+ if File.exists?(index_path) # lets use existing index
155
+ logger.info "Using existing database index: #{index_path}"
156
+ return File.read(index_path)
157
+ end
158
+ end
159
+ end rescue logger.error "Could not read: #{checksum_path} or #{index_path}"
160
+
161
+ # database directory has changed, or index file doesn't exist
162
+ # thus we run blastdbcmd to get formatted FASTA files
163
+ logger.info "Scanning for BLAST databases & creating index"
164
+ cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
165
+ ' -list_outfmt "%f %t %p %n %l %d %v"'
166
+ out, err = sys(cmd, path: config[:bin])
167
+ errpat = /BLAST Database error/
168
+ fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
169
+
170
+ # write checksum and index to file
171
+ File.open(checksum_path, 'w') { |f| f.write(current_db_checksum) } rescue
172
+ logger.error "Could not write database checksum to file" + checksum_path
173
+ File.open(index_path, 'w') { |f| f.write(out) } rescue
174
+ logger.error "Could not write database index to file" + index_path
175
+
176
+ return out
177
+ rescue CommandFailed => e
178
+ fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
179
+ end
180
+
181
+ # Create BLAST database, given FASTA file and sequence type in FASTA file.
182
+ def make_blast_database(action, file, title, type, non_parse_seqids = false)
183
+ return unless make_blast_database?(action, file, type)
184
+ title = confirm_database_title(title)
185
+ extract_fasta(file) unless File.exist?(file)
186
+ taxonomy = taxid_map(file, non_parse_seqids) || taxid
187
+ _make_blast_database(file, type, title, taxonomy)
188
+ end
189
+
190
+ # Show file path and guessed sequence type to the user and obtain a y/n
191
+ # response.
192
+ #
193
+ # Returns true if the user entered anything but 'n' or 'N'.
194
+ def make_blast_database?(action, file, type)
195
+ puts
196
+ puts
197
+ puts "FASTA file to #{action}: #{file}"
198
+ puts "FASTA type: #{type}"
199
+ print 'Proceed? [y/n] (Default: y): '
200
+ response = STDIN.gets.to_s.strip
201
+ !response.match(/n/i)
202
+ end
203
+
204
+ # Show the database title that we are going to use to the user for
205
+ # confirmation.
206
+ #
207
+ # Returns user input if any. Auto-determined title otherwise.
208
+ def confirm_database_title(default)
209
+ print "Enter a database title or will use '#{default}': "
210
+ from_user = STDIN.gets.to_s.strip
211
+ from_user.empty? && default || from_user
212
+ end
213
+
214
+ # Check if a '.taxid_map.txt' file exists. If not, try getting it
215
+ # using blastdbcmd.
216
+ def taxid_map(db, non_parse_seqids)
217
+ return if non_parse_seqids
218
+ taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
219
+ extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
220
+ "-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
221
+ end
222
+
223
+ # Get taxid from the user. Returns user input or 0.
224
+ #
225
+ # Using 0 as taxid is equivalent to not setting taxid for the database
226
+ # that will be created.
227
+ def taxid
228
+ default = 0
229
+ print 'Enter taxid (optional): '
230
+ user_response = STDIN.gets.strip
231
+ "-taxid #{user_response.empty? && default || Integer(user_response)}"
232
+ rescue ArgumentError # presumably from call to Integer()
233
+ puts 'taxid should be a number'
234
+ retry
235
+ end
236
+
237
+ def _make_blast_database(file, type, title, taxonomy)
238
+ cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
239
+ " -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
240
+ " #{taxonomy}"
241
+ out, err = sys(cmd, path: config[:bin])
242
+ puts out.strip
243
+ puts err.strip
244
+ return true
245
+ rescue CommandFailed => e
246
+ puts <<~MSG
247
+ Could not create BLAST database for: #{file}
248
+ Tried: #{cmd}
249
+ stdout: #{e.stdout}
250
+ stderr: #{e.stderr}
251
+ MSG
252
+ exit!
253
+ end
254
+
255
+ # Extract FASTA file from BLAST database.
256
+ #
257
+ # Invoked while reformatting a BLAST database if the corresponding
258
+ # FASTA file does not exist.
259
+ def extract_fasta(db)
260
+ puts
261
+ puts 'Extracting sequences ...'
262
+ cmd = "blastdbcmd -entry all -db #{db}"
263
+ sys(cmd, stdout: db, path: config[:bin])
264
+ rescue CommandFailed => e
265
+ puts <<~MSG
266
+ Could not extract sequences from: #{db}
267
+ Tried: #{cmd}
268
+ stdout: #{e.stdout}
269
+ stderr: #{e.stderr}
270
+ MSG
271
+ exit!
272
+ end
273
+
274
+ def extract_taxid_map(db, taxmap_file)
275
+ cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
276
+ sys(cmd, stdout: taxmap_file, path: config[:bin])
277
+ rescue CommandFailed => e
278
+ # silence error
279
+ end
280
+
281
+ # Returns true if the database name appears to be a multi-part database
282
+ # name.
283
+ #
284
+ # e.g.
285
+ # /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
286
+ # /home/ben/pd.ben/sequenceserver/db/nr => no
287
+ # /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
288
+ # /home/ben/pd.ben/sequenceserver/db/nr00 => no
289
+ # /mnt/blast-db/refseq_genomic.100 => yes
290
+ def multipart_database_name?(db_name)
291
+ !(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
292
+ end
293
+
294
+ def get_categories(path)
295
+ path
296
+ .gsub(config[:database_dir], '') # remove database_dir from path
297
+ .split('/')
298
+ .reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
299
+ end
300
+
301
+ # Returns true if the file name ends with a known FASTA-like suffix and the first character of the file is '>'.
302
+ def probably_fasta?(file)
303
+ return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)
304
+ File.read(file, 1) == '>'
305
+ end
306
+
307
+ # Suggests improved titles when generating database names from files
308
+ # for improved appearance and readability in web interface.
309
+ # For example:
310
+ # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
311
+ # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
312
+ def make_db_title(path)
313
+ db_name = File.basename(path)
314
+ db_name.tr!('"', "'")
315
+ # removes .fasta like extension names
316
+ db_name.gsub!(File.extname(db_name), '')
317
+ # replaces _ with ' ',
318
+ db_name.gsub!(/(_)/, ' ')
319
+ # replaces '.' with ' ' when no numbers are on either side,
320
+ db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
321
+ # preserves version numbers
322
+ db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
323
+ db_name
324
+ end
325
+
326
+ # Guess whether FASTA file contains protein or nucleotide sequences by
327
+ # sampling the first portion of the file.
328
+ def guess_sequence_type_in_fasta(file)
329
+ sequences = sample_sequences(file)
330
+ sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
331
+ sequence_types = sequence_types.uniq.compact
332
+ (sequence_types.length == 1) && sequence_types.first
333
+ end
334
+
335
+ # Read first 1,048,576 characters of the file, split the read text on
336
+ # fasta def line pattern and return.
337
+ #
338
+ # If the given file is FASTA, returns Array of as many different
339
+ # sequences in the portion of the file read. Returns the portion
340
+ # of the file read wrapped in an Array otherwise.
341
+ def sample_sequences(file)
342
+ File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
343
+ end
344
+ end
345
+ end
@@ -30,6 +30,10 @@ module SequenceServer
30
30
  any_to_format? || any_to_reformat?
31
31
  end
32
32
 
33
+ def no_fastas?
34
+ probably_fastas.empty?
35
+ end
36
+
33
37
  # Runs makeblastdb on each file in `fastas_to_format` and
34
38
  # `fastas_to_reformat`.
35
39
  def run
@@ -79,12 +83,12 @@ module SequenceServer
79
83
  @formatted_fastas
80
84
  end
81
85
 
82
- private
83
-
84
86
  def any_to_format?
85
87
  fastas_to_format.any?
86
88
  end
87
89
 
90
+ private
91
+
88
92
  def any_to_reformat?
89
93
  fastas_to_reformat.any?
90
94
  end
@@ -107,22 +111,32 @@ module SequenceServer
107
111
  def fastas_to_format
108
112
  return @fastas_to_format if defined?(@fastas_to_format)
109
113
 
110
- @fastas_to_format = []
114
+ formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
115
+ fasta_paths_to_format = probably_fastas - formatted_fasta_paths
116
+
117
+ @fastas_to_format = fasta_paths_to_format.map do |path|
118
+ [
119
+ path,
120
+ make_db_title(path),
121
+ guess_sequence_type_in_fasta(path)
122
+ ]
123
+ end
124
+
125
+ @fastas_to_format
126
+ end
127
+
128
+ def probably_fastas
129
+ return @probably_fastas if defined?(@probably_fastas)
130
+
131
+ @probably_fastas = []
111
132
 
112
- # Add a trailing slash to database_dir - Find.find doesn't work as
113
- # expected without the trailing slash if database_dir is a symlink
114
- # inside a docker container.
115
133
  Find.find(database_dir + '/') do |path|
116
134
  next if File.directory?(path)
117
- next unless probably_fasta?(path)
118
- next if formatted_fastas.any? { |f| f[0] == path }
119
135
 
120
- @fastas_to_format << [path,
121
- make_db_title(path),
122
- guess_sequence_type_in_fasta(path)]
136
+ @probably_fastas << path if probably_fasta?(path)
123
137
  end
124
138
 
125
- @fastas_to_format
139
+ @probably_fastas
126
140
  end
127
141
 
128
142
  # Runs `blastdbcmd` to determine formatted FASTA files in the database