sequenceserver 2.2.0 → 3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/COPYRIGHT.txt +1 -1
  3. data/bin/sequenceserver +4 -2
  4. data/lib/sequenceserver/blast/error.rb +53 -0
  5. data/lib/sequenceserver/blast/job.rb +2 -43
  6. data/lib/sequenceserver/job.rb +21 -11
  7. data/lib/sequenceserver/makeblastdb-modified-with-cache.rb +345 -0
  8. data/lib/sequenceserver/makeblastdb.rb +26 -12
  9. data/lib/sequenceserver/routes.rb +29 -3
  10. data/lib/sequenceserver/server.rb +1 -1
  11. data/lib/sequenceserver/version.rb +1 -1
  12. data/lib/sequenceserver.rb +3 -0
  13. data/public/404.html +27 -0
  14. data/public/config.js +0 -6
  15. data/public/css/grapher.css +1 -1
  16. data/public/css/sequenceserver.css +22 -11
  17. data/public/css/sequenceserver.min.css +2 -2
  18. data/public/js/circos.js +7 -3
  19. data/public/js/dnd.js +3 -3
  20. data/public/js/fastq_to_fasta.js +35 -0
  21. data/public/js/form.js +30 -11
  22. data/public/js/grapher.js +123 -113
  23. data/public/js/hit.js +8 -2
  24. data/public/js/hits_overview.js +4 -1
  25. data/public/js/jquery_world.js +0 -1
  26. data/public/js/kablammo.js +4 -0
  27. data/public/js/length_distribution.js +5 -1
  28. data/public/js/null_plugins/download_links.js +7 -0
  29. data/public/js/null_plugins/hit_buttons.js +11 -0
  30. data/public/js/null_plugins/report_plugins.js +18 -0
  31. data/public/js/query.js +26 -6
  32. data/public/js/report.js +33 -17
  33. data/public/js/search.js +0 -8
  34. data/public/js/sidebar.js +11 -1
  35. data/public/js/tests/mock_data/sequences.js +18 -1
  36. data/public/js/tests/search_query.spec.js +12 -3
  37. data/public/sequenceserver-report.min.js +76 -42
  38. data/public/sequenceserver-search.min.js +34 -33
  39. data/views/layout.erb +9 -12
  40. metadata +32 -23
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 61b15f95cdc065739de70f2ba94d3022e6beec789dd9c64199ee1be3a2459c63
4
- data.tar.gz: d6c108bd39cb8c6a832787d2cbaa52b2c5a3653c0b08b289cb149b7221a9870c
3
+ metadata.gz: 68dcb3dda53edae5d095423c164568484e786b927fcf733a4722eafde7c3a155
4
+ data.tar.gz: afb985fe5e762b8a1ab9e1dc4e8dd0bc6158f601a3cc95c0029f03a8587c1fa2
5
5
  SHA512:
6
- metadata.gz: d4c2b1785980dd4e46a24a8a5a6d36652fb23f5389f9b7320cd9360f081cbad82b6dc6bfe735b5321738611094052631af0de635b77676587d703d466bd5f604
7
- data.tar.gz: 235e35e8ba4331894a7ac9f107d950462d3b3d892ddafc6f58d3ba48fbe6a03cfc016ac64798921472e296a0cb5577245aca49cbb517d897094a501ec4483beb
6
+ metadata.gz: fb122a17858e6b7c567a418bda05358ae1a9b34ef1dfdcc11bb5f83097b47e427d7344e61e9f9102488f59f6448592b7d66e1e67ee213498e750dfea5e2e7c8f
7
+ data.tar.gz: b7b7504faf439acda18aafa0866e74f3645e431b4252bdb12ffd4896d7cf6ee065756db9e1bba027184794f306ce3b732c4911dce56d8b4a96e734dcf86c336c
data/COPYRIGHT.txt CHANGED
@@ -3,7 +3,7 @@ SequenceServer is copyright Anurag Priyam, Ben J Woodcroft and Yannick Wurm,
3
3
  version 3, the text of which can be found in LICENSE.txt.
4
4
 
5
5
  Components of SequenceServer, including Sinatra, Ox, Slop, html5shiv,
6
- Underscore, jQuery, jQuery UI, Bootstrap and Webshim, are licensed under
6
+ Underscore, jQuery, jQuery UI and Bootstrap, are licensed under
7
7
  the MIT license. D3.js is licensed under BSD license. biojs-vis-sequence
8
8
  is licensed under Apache license. Thin and JSON are licensed under Ruby
9
9
  license. All unmodified files from these and other sources retain their
data/bin/sequenceserver CHANGED
@@ -358,7 +358,9 @@ begin
358
358
  end
359
359
 
360
360
  if make_blast_databases?
361
- if SequenceServer.makeblastdb.any_to_format_or_reformat?
361
+ if SequenceServer.makeblastdb.no_fastas?
362
+ puts "Couldn't find any FASTA files in #{SequenceServer.config[:database_dir]}."
363
+ elsif SequenceServer.makeblastdb.any_to_format_or_reformat?
362
364
  puts
363
365
  puts <<~MSG
364
366
  SequenceServer has scanned your databases directory and will now offer
@@ -380,7 +382,7 @@ begin
380
382
  print '>> '
381
383
  response = STDIN.gets.to_s.strip
382
384
  SequenceServer.makeblastdb.run unless response =~ /^[n]$/i
383
- else
385
+ else
384
386
  puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
385
387
  'are formatted.'
386
388
  end
@@ -0,0 +1,53 @@
1
+ # http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
2
+
3
+ module SequenceServer
4
+ module BLAST
5
+ class Error
6
+ attr_reader :exitstatus, :stdout, :stderr
7
+
8
+ def initialize(exitstatus:, stdout:, stderr:)
9
+ @exitstatus = exitstatus
10
+ @stdout = stdout
11
+ @stderr = stderr
12
+ end
13
+
14
+ def raise!
15
+ return true if exitstatus.zero? && !File.zero?(stdout)
16
+
17
+ case exitstatus
18
+ when 1..2
19
+ # 1: Error in query sequences or options.
20
+ # 2: Error in BLAST databases.
21
+ error = IO.foreach(stderr).grep(ERROR_LINE).join
22
+ error = File.read(stderr) if error.empty?
23
+ fail InputError, "(#{exitstatus}) #{error}"
24
+ when 4
25
+ # Out of memory. User can retry with a shorter search, so raising
26
+ # InputError here instead of SystemError.
27
+ fail InputError, <<~MSG
28
+ Ran out of memory. Please try a smaller query, fewer and smaller
29
+ databases, or limiting the output by using advanced options.
30
+ MSG
31
+ when 6
32
+ # Error creating output files. It can't be a permission issue as that
33
+ # would have been caught while creating job directory. But we can run
34
+ # out of storage after creating the job directory and while running
35
+ # the job. This is a SystemError.
36
+ fail SystemError, 'Ran out of disk space.'
37
+ else
38
+ # I am not sure what exit code 3 means and we should not
39
+ # encounter exit code 5. The only other error that I know can happen
40
+ # but is not yet handled is when BLAST+ binaries break such as after
41
+ # macOS updates. So raise SystemError, include the exit status in the
42
+ # message, and say that the "most likely" reason is broken BLAST+
43
+ # binaries.
44
+
45
+ error = File.read(stderr)
46
+ error = 'Most likely there is a problem with the BLAST+ binaries.' if error.empty?
47
+
48
+ fail SystemError, "BLAST failed abruptly (exit status: #{exitstatus}). #{error}"
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end
@@ -1,5 +1,6 @@
1
1
  require 'sequenceserver/job'
2
2
  require 'sequenceserver/zip_file_generator'
3
+ require 'sequenceserver/blast/error'
3
4
 
4
5
  module SequenceServer
5
6
  # BLAST module.
@@ -56,46 +57,8 @@ module SequenceServer
56
57
  " -query '#{qfile}' #{options}"
57
58
  end
58
59
 
59
- # Override Job#raise! to raise specific API errors based on exitstatus
60
- # and using contents of stderr to provide context about the error.
61
- #
62
60
  def raise!
63
- # Return true exit status is 0 and stdout is not empty.
64
- return true if exitstatus.zero? && !File.zero?(stdout)
65
-
66
- # Handle error. See [1].
67
- case exitstatus
68
- when 1..2
69
- # 1: Error in query sequences or options.
70
- # 2: Error in BLAST databases.
71
- error = IO.foreach(stderr).grep(ERROR_LINE).join
72
- error = File.read(stderr) if error.empty?
73
- fail InputError, "(#{exitstatus}) #{error}"
74
- when 4
75
- # Out of memory. User can retry with a shorter search, so raising
76
- # InputError here instead of SystemError.
77
- fail InputError, <<~MSG
78
- Ran out of memory. Please try a smaller query, fewer and smaller
79
- databases, or limiting the output by using advanced options.
80
- MSG
81
- when 6
82
- # Error creating output files. It can't be a permission issue as that
83
- # would have been caught while creating job directory. But we can run
84
- # out of storage after creating the job directory and while running
85
- # the job. This is a SystemError.
86
- fail SystemError, 'Ran out of disk space.'
87
- else
88
- # I am not sure what the exit codes 3 means and we should not
89
- # encounter exit code 5. The only other error that I know can happen
90
- # but is not yet handled is when BLAST+ binaries break such as after
91
- # macOS updates. So raise SystemError, include the exit status in the
92
- # message, and say that that the "most likely" reason is broken BLAST+
93
- # binaries.
94
- fail SystemError, <<~MSG
95
- BLAST failed abruptly (exit status: #{exitstatus}). Most likely there is a
96
- problem with the BLAST+ binaries.
97
- MSG
98
- end
61
+ SequenceServer::BLAST::Error.new(exitstatus: exitstatus, stdout: stdout, stderr: stderr).raise!
99
62
  end
100
63
 
101
64
  # Use it with a block to get a self-cleaning temporary archive file
@@ -189,7 +152,3 @@ module SequenceServer
189
152
  end
190
153
  end
191
154
  end
192
-
193
- # References
194
- # ----------
195
- # [1]: http://www.ncbi.nlm.nih.gov/books/NBK1763/ (Appendices)
@@ -28,16 +28,25 @@ module SequenceServer
28
28
  enqueue(job)
29
29
  end
30
30
 
31
+ def serializable_classes
32
+ [
33
+ Time,
34
+ Symbol,
35
+ SequenceServer::Job,
36
+ SequenceServer::BLAST::Job,
37
+ SequenceServer::Database
38
+ ]
39
+ end
40
+
31
41
  # Fetches job with the given id.
32
42
  def fetch(id)
33
43
  job_file = File.join(DOTDIR, id, 'job.yaml')
34
- fail NotFound unless File.exist?(job_file)
35
- if RUBY_VERSION < '3.1.0'
36
- YAML.load_file(job_file)
37
- else
38
- YAML.load_file(job_file, permitted_classes: [Time, Symbol, SequenceServer::BLAST::Job, \
39
- SequenceServer::Database, SequenceServer::Job])
40
- end
44
+ return nil unless File.exist?(job_file)
45
+
46
+ YAML.safe_load_file(
47
+ job_file,
48
+ permitted_classes: serializable_classes
49
+ )
41
50
  end
42
51
 
43
52
  # Deletes job with the given id.
@@ -75,8 +84,8 @@ module SequenceServer
75
84
  # of job data will be held, yields (if block given) and saves the job.
76
85
  #
77
86
  # Subclasses should extend `initialize` as per requirement.
78
- def initialize(*)
79
- @id = SecureRandom.uuid
87
+ def initialize(params = {})
88
+ @id = params.fetch(:id, SecureRandom.uuid)
80
89
  @submitted_at = Time.now
81
90
  mkdir_p dir
82
91
  yield if block_given?
@@ -85,7 +94,7 @@ module SequenceServer
85
94
  raise SystemError, 'Not enough disk space to start a new job'
86
95
  rescue Errno::EACCES
87
96
  raise SystemError, "Permission denied to write to #{DOTDIR}"
88
- rescue => e
97
+ rescue StandardError => e
89
98
  rm_rf dir
90
99
  raise e
91
100
  end
@@ -117,7 +126,7 @@ module SequenceServer
117
126
  # should be called on a completed job before attempting to use the results.
118
127
  # Subclasses should provide their own implementation.
119
128
  def raise!
120
- raise if done? && exitstatus != 0
129
+ fail if done? && exitstatus != 0
121
130
  end
122
131
 
123
132
  # Where will the stdout be written to during execution and read from later.
@@ -163,6 +172,7 @@ module SequenceServer
163
172
  def fetch(key)
164
173
  filename = File.join(dir, key)
165
174
  fail unless File.exist? filename
175
+
166
176
  filename
167
177
  end
168
178
 
@@ -0,0 +1,345 @@
1
+ require 'find'
2
+ require 'forwardable'
3
+
4
+ module SequenceServer
5
+ # Smart `makeblastdb` wrapper: recursively scans database directory determining
6
+ # which files need to be formatted or re-formatted.
7
+ #
8
+ # Example usage:
9
+ #
10
+ # makeblastdb = MAKEBLASTDB.new(database_dir)
11
+ # makeblastdb.scan && makeblastdb.run
12
+ #
13
+ class MAKEBLASTDB
14
+ extend Forwardable
15
+
16
+ def_delegators SequenceServer, :config, :sys, :logger
17
+
18
+ def initialize(database_dir)
19
+ @database_dir = database_dir
20
+ end
21
+
22
+ attr_reader :database_dir
23
+ attr_reader :formatted_fastas
24
+ attr_reader :fastas_to_format
25
+ attr_reader :fastas_to_reformat
26
+
27
+ # Scans the database directory to determine which FASTA files require
28
+ # formatting or re-formatting.
29
+ #
30
+ # Returns `true` if there are files to (re-)format, `false` otherwise.
31
+ def scan
32
+ # We need to know the list of formatted FASTAs as reported by blastdbcmd
33
+ # first. This is required to determine both unformatted FASTAs and those
34
+ # that require reformatting.
35
+ @formatted_fastas = []
36
+ determine_formatted_fastas
37
+
38
+ # Now determine FASTA files that are unformatted or require reformatting.
39
+ @fastas_to_format = []
40
+ determine_unformatted_fastas
41
+ @fastas_to_reformat = []
42
+ determine_fastas_to_reformat
43
+
44
+ # Return true if there are files to be (re-)formatted or false otherwise.
45
+ !@fastas_to_format.empty? || !@fastas_to_reformat.empty?
46
+ end
47
+
48
+ # Returns true if at least one database in database directory is formatted.
49
+ def any_formatted?
50
+ !@formatted_fastas.empty?
51
+ end
52
+
53
+ # Returns true if there is at least one unformatted FASTA in the databases
54
+ # directory.
55
+ def any_unformatted?
56
+ !@fastas_to_format.empty?
57
+ end
58
+
59
+ # Returns true if the databases directory contains one or more incompatible
60
+ # databases.
61
+ #
62
+ # Note that it is okay to only use V4 databases or only V5 databases.
63
+ # Incompatibility arises when they are mixed.
64
+ def any_incompatible?
65
+ return false if @formatted_fastas.all? { |ff| ff.v4? || ff.alias? }
66
+ return false if @formatted_fastas.all? { |ff| ff.v5? || ff.alias? }
67
+ true
68
+ end
69
+
70
+ # Runs makeblastdb on each file in `@fastas_to_format` and
71
+ # `@fastas_to_reformat`. Will do nothing unless `#scan`
72
+ # has been run before.
73
+ def run
74
+ format
75
+ reformat
76
+ end
77
+
78
+ # Format any unformatted FASTA files in database directory. Returns Array
79
+ # of files that were formatted.
80
+ def format
81
+ # Make the intent clear as well as ensure the program won't crash if we
82
+ # accidentally call format before calling scan.
83
+ return unless @fastas_to_format
84
+ @fastas_to_format.select do |path, title, type|
85
+ make_blast_database('format', path, title, type)
86
+ end
87
+ end
88
+
89
+ # Re-format databases that require reformatting. Returns Array of files
90
+ # that were reformatted.
91
+ def reformat
92
+ # Make the intent clear as well as ensure the program won't crash if
93
+ # we accidentally call reformat before calling scan.
94
+ return unless @fastas_to_reformat
95
+ @fastas_to_reformat.select do |path, title, type, non_parse_seqids|
96
+ make_blast_database('reformat', path, title, type, non_parse_seqids)
97
+ end
98
+ end
99
+
100
+ private
101
+
102
+ # Determines which FASTA files in the database directory are already
103
+ # formatted. Adds to @formatted_fastas.
104
+ def determine_formatted_fastas
105
+ blastdbcmd.each_line do |line|
106
+ path, *rest = line.chomp.split("\t")
107
+ next if multipart_database_name?(path)
108
+ rest << get_categories(path)
109
+ @formatted_fastas << Database.new(path, *rest)
110
+ end
111
+ end
112
+
113
+ # Determines which FASTA files in the database directory require
114
+ # reformatting. Adds to @fastas_to_reformat.
115
+ def determine_fastas_to_reformat
116
+ @formatted_fastas.each do |ff|
117
+ if ff.v4? || ff.non_parse_seqids?
118
+ @fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?]
119
+ end
120
+ end
121
+ end
122
+
123
+ # Determines which FASTA files in the database directory are
124
+ # unformatted. Adds to @fastas_to_format.
125
+ def determine_unformatted_fastas
126
+ # Add a trailing slash to database_dir - Find.find doesn't work as
127
+ # expected without the trailing slash if database_dir is a symlink
128
+ # inside a docker container.
129
+ Find.find(database_dir + '/') do |path|
130
+ next if File.directory?(path)
131
+ next unless probably_fasta?(path)
132
+ next if @formatted_fastas.any? { |f| f[0] == path }
133
+
134
+ @fastas_to_format << [path,
135
+ make_db_title(path),
136
+ guess_sequence_type_in_fasta(path)]
137
+ end
138
+ end
139
+
140
+ # Runs `blastdbcmd` to determine formatted FASTA files in the database
141
+ # directory. Returns the output of `blastdbcmd`. This method is called
142
+ # by `determine_formatted_fastas`.
143
+ def blastdbcmd
144
+ # calculate checksum of database directory
145
+ current_db_checksum = Zlib::crc32(Dir.glob(File.join(config[:database_dir], '/**/*')).map {
146
+ |path| path.to_s + "_" + File.mtime(path).to_s + "_" + File.size(path).to_s
147
+ }.to_s)
148
+
149
+ checksum_path = config[:database_dir].chomp('/') + '.checksum'
150
+ index_path = config[:database_dir].chomp('/') + '.index'
151
+
152
+ if File.exists?(checksum_path)
153
+ if current_db_checksum == File.read(checksum_path).to_i # db directory hasn't changed
154
+ if File.exists?(index_path) # lets use existing index
155
+ logger.info "Using existing database index: #{index_path}"
156
+ return File.read(index_path)
157
+ end
158
+ end
159
+ end rescue logger.error "Could not read: #{checksum_path} or #{index_path}"
160
+
161
+ # database directory has changed, or index file doesn't exist
162
+ # thus we run blastdbcmd to get formatted FASTA files
163
+ logger.info "Scanning for BLAST databases & creating index"
164
+ cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
165
+ ' -list_outfmt "%f %t %p %n %l %d %v"'
166
+ out, err = sys(cmd, path: config[:bin])
167
+ errpat = /BLAST Database error/
168
+ fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)
169
+
170
+ # write checksum and index to file
171
+ File.open(checksum_path, 'w') { |f| f.write(current_db_checksum) } rescue
172
+ logger.error "Could not write database checksum to file" + checksum_path
173
+ File.open(index_path, 'w') { |f| f.write(out) } rescue
174
+ logger.error "Could not write database index to file" + index_path
175
+
176
+ return out
177
+ rescue CommandFailed => e
178
+ fail BLAST_DATABASE_ERROR.new(cmd, e.stderr)
179
+ end
180
+
181
+ # Create BLAST database, given FASTA file and sequence type in FASTA file.
182
+ def make_blast_database(action, file, title, type, non_parse_seqids = false)
183
+ return unless make_blast_database?(action, file, type)
184
+ title = confirm_database_title(title)
185
+ extract_fasta(file) unless File.exist?(file)
186
+ taxonomy = taxid_map(file, non_parse_seqids) || taxid
187
+ _make_blast_database(file, type, title, taxonomy)
188
+ end
189
+
190
+ # Show file path and guessed sequence type to the user and obtain a y/n
191
+ # response.
192
+ #
193
+ # Returns true if the user entered anything but 'n' or 'N'.
194
+ def make_blast_database?(action, file, type)
195
+ puts
196
+ puts
197
+ puts "FASTA file to #{action}: #{file}"
198
+ puts "FASTA type: #{type}"
199
+ print 'Proceed? [y/n] (Default: y): '
200
+ response = STDIN.gets.to_s.strip
201
+ !response.match(/n/i)
202
+ end
203
+
204
+ # Show the database title that we are going to use to the user for
205
+ # confirmation.
206
+ #
207
+ # Returns user input if any. Auto-determined title otherwise.
208
+ def confirm_database_title(default)
209
+ print "Enter a database title or will use '#{default}': "
210
+ from_user = STDIN.gets.to_s.strip
211
+ from_user.empty? && default || from_user
212
+ end
213
+
214
+ # Check if a '.taxid_map.txt' file exists. If not, try getting it
215
+ # using blastdbcmd.
216
+ def taxid_map(db, non_parse_seqids)
217
+ return if non_parse_seqids
218
+ taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
219
+ extract_taxid_map(db, taxid_map) if !File.exist?(taxid_map)
220
+ "-taxid_map #{taxid_map}" if !File.zero?(taxid_map)
221
+ end
222
+
223
+ # Get taxid from the user. Returns user input or 0.
224
+ #
225
+ # Using 0 as taxid is equivalent to not setting taxid for the database
226
+ # that will be created.
227
+ def taxid
228
+ default = 0
229
+ print 'Enter taxid (optional): '
230
+ user_response = STDIN.gets.strip
231
+ "-taxid #{user_response.empty? && default || Integer(user_response)}"
232
+ rescue ArgumentError # presumably from call to Integer()
233
+ puts 'taxid should be a number'
234
+ retry
235
+ end
236
+
237
+ def _make_blast_database(file, type, title, taxonomy)
238
+ cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
239
+ " -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
240
+ " #{taxonomy}"
241
+ out, err = sys(cmd, path: config[:bin])
242
+ puts out.strip
243
+ puts err.strip
244
+ return true
245
+ rescue CommandFailed => e
246
+ puts <<~MSG
247
+ Could not create BLAST database for: #{file}
248
+ Tried: #{cmd}
249
+ stdout: #{e.stdout}
250
+ stderr: #{e.stderr}
251
+ MSG
252
+ exit!
253
+ end
254
+
255
+ # Extract FASTA file from BLAST database.
256
+ #
257
+ # Invoked while reformatting a BLAST database if the corresponding
258
+ # FASTA file does not exist.
259
+ def extract_fasta(db)
260
+ puts
261
+ puts 'Extracting sequences ...'
262
+ cmd = "blastdbcmd -entry all -db #{db}"
263
+ sys(cmd, stdout: db, path: config[:bin])
264
+ rescue CommandFailed => e
265
+ puts <<~MSG
266
+ Could not extract sequences from: #{db}
267
+ Tried: #{cmd}
268
+ stdout: #{e.stdout}
269
+ stderr: #{e.stderr}
270
+ MSG
271
+ exit!
272
+ end
273
+
274
+ def extract_taxid_map(db, taxmap_file)
275
+ cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
276
+ sys(cmd, stdout: taxmap_file, path: config[:bin])
277
+ rescue CommandFailed => e
278
+ # silence error
279
+ end
280
+
281
+ # Returns true if the database name appears to be a multi-part database
282
+ # name.
283
+ #
284
+ # e.g.
285
+ # /home/ben/pd.ben/sequenceserver/db/nr.00 => yes
286
+ # /home/ben/pd.ben/sequenceserver/db/nr => no
287
+ # /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
288
+ # /home/ben/pd.ben/sequenceserver/db/nr00 => no
289
+ # /mnt/blast-db/refseq_genomic.100 => yes
290
+ def multipart_database_name?(db_name)
291
+ !(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
292
+ end
293
+
294
+ def get_categories(path)
295
+ path
296
+ .gsub(config[:database_dir], '') # remove database_dir from path
297
+ .split('/')
298
+ .reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
299
+ end
300
+
301
+ # Returns true if the file name ends in a FASTA-like extension and the first character of the file is '>'.
302
+ def probably_fasta?(file)
303
+ return false unless file.match(/((cds)|(fasta)|(fna)|(pep)|(cdna)|(fa)|(prot)|(fas)|(genome)|(nuc)|(dna)|(nt))$/i)
304
+ File.read(file, 1) == '>'
305
+ end
306
+
307
+ # Suggests improved titles when generating database names from files
308
+ # for improved appearance and readability in web interface.
309
+ # For example:
310
+ # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
311
+ # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
312
+ def make_db_title(path)
313
+ db_name = File.basename(path)
314
+ db_name.tr!('"', "'")
315
+ # removes .fasta like extension names
316
+ db_name.gsub!(File.extname(db_name), '')
317
+ # replaces _ with ' ',
318
+ db_name.gsub!(/(_)/, ' ')
319
+ # replaces '.' with ' ' when no numbers are on either side,
320
+ db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
321
+ # preserves version numbers
322
+ db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
323
+ db_name
324
+ end
325
+
326
+ # Guess whether FASTA file contains protein or nucleotide sequences by
327
+ # sampling a few characters of the file.
328
+ def guess_sequence_type_in_fasta(file)
329
+ sequences = sample_sequences(file)
330
+ sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
331
+ sequence_types = sequence_types.uniq.compact
332
+ (sequence_types.length == 1) && sequence_types.first
333
+ end
334
+
335
+ # Read first 1,048,576 characters of the file, split the read text on
336
+ # fasta def line pattern and return.
337
+ #
338
+ # If the given file is FASTA, returns Array of as many different
339
+ # sequences in the portion of the file read. Returns the portion
340
+ # of the file read wrapped in an Array otherwise.
341
+ def sample_sequences(file)
342
+ File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
343
+ end
344
+ end
345
+ end
@@ -30,6 +30,10 @@ module SequenceServer
30
30
  any_to_format? || any_to_reformat?
31
31
  end
32
32
 
33
+ def no_fastas?
34
+ probably_fastas.empty?
35
+ end
36
+
33
37
  # Runs makeblastdb on each file in `fastas_to_format` and
34
38
  # `fastas_to_reformat`.
35
39
  def run
@@ -79,12 +83,12 @@ module SequenceServer
79
83
  @formatted_fastas
80
84
  end
81
85
 
82
- private
83
-
84
86
  def any_to_format?
85
87
  fastas_to_format.any?
86
88
  end
87
89
 
90
+ private
91
+
88
92
  def any_to_reformat?
89
93
  fastas_to_reformat.any?
90
94
  end
@@ -107,22 +111,32 @@ module SequenceServer
107
111
  def fastas_to_format
108
112
  return @fastas_to_format if defined?(@fastas_to_format)
109
113
 
110
- @fastas_to_format = []
114
+ formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
115
+ fasta_paths_to_format = probably_fastas - formatted_fasta_paths
116
+
117
+ @fastas_to_format = fasta_paths_to_format.map do |path|
118
+ [
119
+ path,
120
+ make_db_title(path),
121
+ guess_sequence_type_in_fasta(path)
122
+ ]
123
+ end
124
+
125
+ @fastas_to_format
126
+ end
127
+
128
+ def probably_fastas
129
+ return @probably_fastas if defined?(@probably_fastas)
130
+
131
+ @probably_fastas = []
111
132
 
112
- # Add a trailing slash to database_dir - Find.find doesn't work as
113
- # expected without the trailing slash if database_dir is a symlink
114
- # inside a docker container.
115
133
  Find.find(database_dir + '/') do |path|
116
134
  next if File.directory?(path)
117
- next unless probably_fasta?(path)
118
- next if formatted_fastas.any? { |f| f[0] == path }
119
135
 
120
- @fastas_to_format << [path,
121
- make_db_title(path),
122
- guess_sequence_type_in_fasta(path)]
136
+ @probably_fastas << path if probably_fasta?(path)
123
137
  end
124
138
 
125
- @fastas_to_format
139
+ @probably_fastas
126
140
  end
127
141
 
128
142
  # Runs `blastdbcmd` to determine formatted FASTA files in the database