sequenceserver 2.0.0.rc4 → 2.0.0.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (83) hide show
  1. checksums.yaml +4 -4
  2. data/bin/sequenceserver +10 -27
  3. data/lib/sequenceserver.rb +6 -0
  4. data/lib/sequenceserver/blast/job.rb +11 -1
  5. data/lib/sequenceserver/database.rb +0 -134
  6. data/lib/sequenceserver/makeblastdb.rb +243 -0
  7. data/lib/sequenceserver/version.rb +1 -1
  8. data/public/js/hit.js +1 -1
  9. data/public/js/search.js +4 -6
  10. data/public/sequenceserver-report.min.js +1 -1
  11. data/public/sequenceserver-search.min.js +1 -1
  12. data/spec/capybara_spec.rb +11 -0
  13. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ndb +0 -0
  14. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
  15. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
  16. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nos +0 -0
  17. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.not +0 -0
  18. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.ntf +0 -0
  19. data/spec/database/sample/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nto +0 -0
  20. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pdb +0 -0
  21. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
  22. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
  23. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pos +0 -0
  24. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pot +0 -0
  25. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.ptf +0 -0
  26. data/spec/database/sample/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pto +0 -0
  27. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pdb +0 -0
  28. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
  29. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
  30. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pos +0 -0
  31. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pot +0 -0
  32. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.ptf +0 -0
  33. data/spec/database/sample/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pto +0 -0
  34. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ndb +0 -0
  35. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
  36. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
  37. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nos +0 -0
  38. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.not +0 -0
  39. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
  40. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.ntf +0 -0
  41. data/spec/database/sample/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nto +0 -0
  42. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhd +8 -0
  43. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhi +0 -0
  44. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nhr +0 -0
  45. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nin +0 -0
  46. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nog +0 -0
  47. data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsd +0 -0
  48. data/spec/database/{sample → v4}/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsi +0 -0
  49. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.fasta.nsq +0 -0
  50. data/spec/database/v4/genome/Solenopsis_invicta/Solenopsis_invicta_gnG_subset.txt +8 -0
  51. data/spec/database/v4/links.rb +23 -0
  52. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta +6449 -0
  53. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phd +1189 -0
  54. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phi +0 -0
  55. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
  56. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
  57. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.pog +0 -0
  58. data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psd +0 -0
  59. data/spec/database/{sample → v4}/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psi +0 -0
  60. data/spec/database/v4/proteins/Solenopsis_invicta/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
  61. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phd +9140 -0
  62. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phi +0 -0
  63. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.phr +0 -0
  64. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pin +0 -0
  65. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.pog +0 -0
  66. data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psd +0 -0
  67. data/spec/database/{sample → v4}/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psi +0 -0
  68. data/spec/database/v4/proteins/uniprot/2018-04-Swiss-Prot_insecta.fasta.psq +0 -0
  69. data/spec/database/v4/proteins/uniprot/URL +1 -0
  70. data/spec/database/v4/si_uniprot_idmap.yml +14180 -0
  71. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
  72. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhd +473 -0
  73. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhi +0 -0
  74. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
  75. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
  76. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nog +0 -0
  77. data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsd +0 -0
  78. data/spec/database/{sample → v4}/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsi +0 -0
  79. data/spec/database/v4/transcripts/Solenopsis_invicta/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
  80. data/spec/database_spec.rb +0 -76
  81. data/spec/makeblastdb_spec.rb +121 -0
  82. data/views/layout.erb +4 -0
  83. metadata +66 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 56e26f480b73ad075d19001178316bce0bf7a5b26eba33a36dfb17db8ad8e525
4
- data.tar.gz: 9b9e8f88a1941181a8e71e89d93e07f4b2a47d07cfb44e6691333e56eb31537a
3
+ metadata.gz: b04c2f43509bf7d4a15d555e3bd9cea86d3c7770c7a3e35016f963f877055ecf
4
+ data.tar.gz: 25678397eacb2c3f445902624baff80a90d2a47ca98d2b703e13de3eca84cf1e
5
5
  SHA512:
6
- metadata.gz: 07b11c9ed4f89c81d49d73a0a10fde2a89326dab10cefc8794ed13bc165831d234393cf36cdc58c1552ce06bde2057e468e6e17fbaa438c4a31d0036c65591b5
7
- data.tar.gz: 406996da9f95d6c9c2463448d9f4ff72dd33fb70d73bf2e1bbb926c17c641d37d89e9f9c18156ac31d9fc50d5141338ed70e522fc46d8e35903dde9b015e79f6
6
+ metadata.gz: a8e6a4c1b4917d9bd35d07ef8e580e1b75105bfd1709fb892d9ed523da91f8c3a08d50c1c9da304530cd0bb5d77da00a4944c65cd5ce90fc979c9d6099149058
7
+ data.tar.gz: ef493ec4cf1aef63b2f535251143b34154d0d332a47a1d15668c7a74b139d5e6c2a2f9c33b5c7072ae9c2fbbcf33048fad4e828273c85a199a6ab79874d6e766
@@ -68,10 +68,6 @@ begin
68
68
  # of threads to use in config file.
69
69
  $ sequenceserver -s -n 16
70
70
 
71
- # See if you have FASTA files in database dir that haven't
72
- # been converted into BLAST database.
73
- $ sequenceserver -u
74
-
75
71
  # Search for FASTA files in database dir that haven't been
76
72
  # converted into BLAST database yet, and convert them.
77
73
  $ sequenceserver -m
@@ -135,9 +131,6 @@ begin
135
131
  on 'l', 'list_databases',
136
132
  'List BLAST databases'
137
133
 
138
- on 'u', 'list-unformatted-fastas',
139
- 'List unformatted FASTA files'
140
-
141
134
  on 'i', 'interactive',
142
135
  'Run SequenceServer in interactive mode'
143
136
 
@@ -285,8 +278,7 @@ begin
285
278
  fetch_option(:database_dir).value = response
286
279
  redo
287
280
  rescue SequenceServer::NO_BLAST_DATABASE_FOUND => e
288
- unless list_databases? || list_unformatted_fastas? ||
289
- make_blast_databases?
281
+ unless list_databases? || make_blast_databases?
290
282
 
291
283
  # Print error raised.
292
284
  puts
@@ -305,13 +297,13 @@ begin
305
297
  unless response =~ /^[n]$/i
306
298
  puts
307
299
  puts 'Searching ...'
308
- if SequenceServer::Database.unformatted_fastas.empty?
309
- puts "Couldn't find any FASTA files."
310
- exit!
311
- else
312
- formatted = SequenceServer::Database.make_blast_databases
300
+ if SequenceServer.makeblastdb.scan
301
+ formatted = SequenceServer.makeblastdb.run
313
302
  exit! if formatted.empty? && !set?
314
303
  redo unless set?
304
+ else
305
+ puts "Couldn't find any FASTA files."
306
+ exit!
315
307
  end
316
308
  else
317
309
  exit! unless set?
@@ -361,22 +353,13 @@ begin
361
353
  exit
362
354
  end
363
355
 
364
- if list_unformatted_fastas? || make_blast_databases?
365
- unformatted_fastas = SequenceServer::Database.unformatted_fastas
366
- if unformatted_fastas.empty?
356
+ if make_blast_databases?
357
+ if SequenceServer.makeblastdb.scan
358
+ SequenceServer.makeblastdb.run
359
+ else
367
360
  puts "All FASTA files in #{SequenceServer.config[:database_dir]} " \
368
361
  'are formatted.'
369
- exit
370
362
  end
371
- end
372
-
373
- if list_unformatted_fastas?
374
- puts unformatted_fastas
375
- exit
376
- end
377
-
378
- if make_blast_databases?
379
- SequenceServer::Database.make_blast_databases
380
363
  exit
381
364
  end
382
365
 
@@ -24,6 +24,7 @@ module SequenceServer
24
24
  require 'sequenceserver/config'
25
25
  require 'sequenceserver/server'
26
26
  require 'sequenceserver/routes'
27
+ require 'sequenceserver/makeblastdb'
27
28
  require 'sequenceserver/job_remover'
28
29
  require 'sequenceserver/exceptions'
29
30
  require 'sequenceserver/sys'
@@ -57,6 +58,11 @@ module SequenceServer
57
58
  end
58
59
  end
59
60
 
61
+ # MAKEBLASTDB service object.
62
+ def makeblastdb
63
+ @makeblastdb ||= MAKEBLASTDB.new(config[:database_dir])
64
+ end
65
+
60
66
  # SequenceServer initialisation routine.
61
67
  def init(config = {})
62
68
  # Use default config file if caller didn't specify one.
@@ -65,6 +65,16 @@ module SequenceServer
65
65
  error = IO.foreach(stderr).grep(ERROR_LINE).join
66
66
  error = File.read(stderr) if error.empty?
67
67
  fail InputError, error
68
+ when 2
69
+ fail InputError, <<~MSG
70
+ BLAST signalled a problem with the databases that you searched.
71
+
72
+ Most likely one or more of your databases were created using an
73
+ older version of BLAST. Please consider recreating the databases
74
+ using BLAST #{BLAST_VERSION}.
75
+
76
+ As a temporary solution, you can try searching one database at a time.
77
+ MSG
68
78
  when 4
69
79
  # Out of memory. User can retry with a shorter search, so raising
70
80
  # InputError here instead of SystemError.
@@ -79,7 +89,7 @@ module SequenceServer
79
89
  # the job. This is a SystemError.
80
90
  fail SystemError, 'Ran out of disk space.'
81
91
  else
82
- # I am not sure what the exit codes 2 & 3 means and we should note
92
+ # I am not sure what exit code 3 means and we should not
83
93
  # encounter exit code 5. The only other error that I know can happen
84
94
  # but is not yet handled is when BLAST+ binaries break such as after
85
95
  # macOS updates. So raise SystemError, include the exit status in the
@@ -1,4 +1,3 @@
1
- require 'find'
2
1
  require 'open3'
3
2
  require 'digest/md5'
4
3
  require 'forwardable'
@@ -209,97 +208,6 @@ module SequenceServer
209
208
  end
210
209
  # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
211
210
 
212
- # Recursively scan `database_dir` for un-formatted FASTA and format them
213
- # for use with BLAST+.
214
- def make_blast_databases
215
- unformatted_fastas.select do |file, sequence_type|
216
- make_blast_database(file, sequence_type)
217
- end
218
- end
219
-
220
- # Returns an Array of FASTA files that may require formatting, and the
221
- # type of sequence contained in each FASTA.
222
- #
223
- # > unformatted_fastas
224
- # => [['/foo/bar.fasta', :nulceotide], ...]
225
- def unformatted_fastas
226
- list = []
227
- database_dir = config[:database_dir]
228
- Find.find database_dir do |file|
229
- next if File.directory? file
230
- next if Database.include? file
231
- next unless probably_fasta? file
232
- sequence_type = guess_sequence_type_in_fasta file
233
- if %i[protein nucleotide].include?(sequence_type)
234
- list << [file, sequence_type]
235
- end
236
- end
237
- list
238
- end
239
-
240
- # Create BLAST database, given FASTA file and sequence type in FASTA file.
241
- def make_blast_database(file, type)
242
- return unless make_blast_database? file, type
243
- title = get_database_title(file)
244
- taxid = fetch_tax_id
245
- _make_blast_database(file, type, title, taxid)
246
- end
247
-
248
- def _make_blast_database(file, type, title, taxid, quiet = false)
249
- cmd = 'makeblastdb -parse_seqids -hash_index ' \
250
- "-in #{file} -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
251
- " -taxid #{taxid}"
252
- out, err = sys(cmd, path: config[:bin])
253
- puts out, err unless quiet
254
- rescue CommandFailed => e
255
- puts <<~MSG
256
- Could not create BLAST database for: #{file}
257
- Tried: #{cmd}
258
- stdout: #{e.stdout}
259
- stderr: #{e.stderr}
260
- MSG
261
- exit!
262
- end
263
-
264
- # Show file path and guessed sequence type to the user and obtain a y/n
265
- # response.
266
- #
267
- # Returns true if the user entered anything but 'n' or 'N'.
268
- def make_blast_database?(file, type)
269
- puts
270
- puts
271
- puts "FASTA file: #{file}"
272
- puts "FASTA type: #{type}"
273
- print 'Proceed? [y/n] (Default: y): '
274
- response = STDIN.gets.to_s.strip
275
- !response.match(/n/i)
276
- end
277
-
278
- # Generate a title for the given database and show it to the user for
279
- # confirmation.
280
- #
281
- # Returns user input if any. Auto-generated title otherwise.
282
- def get_database_title(path)
283
- default = make_db_title(File.basename(path))
284
- print "Enter a database title or will use '#{default}': "
285
- from_user = STDIN.gets.to_s.strip
286
- from_user.empty? && default || from_user
287
- end
288
-
289
- # Get taxid from the user. Returns user input or 0.
290
- #
291
- # Using 0 as taxid is equivalent to not setting taxid for the database
292
- # that will be created.
293
- def fetch_tax_id
294
- default = 0
295
- print 'Enter taxid (optional): '
296
- user_response = STDIN.gets.strip
297
- user_response.empty? && default || Integer(user_response)
298
- rescue
299
- puts 'taxid should be a number'
300
- retry
301
- end
302
-
303
211
  # Returns true if the database name appears to be a multi-part database
304
212
  # name.
305
213
  #
@@ -312,48 +220,6 @@ module SequenceServer
312
220
  def multipart_database_name?(db_name)
313
221
  !(db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?)
314
222
  end
315
-
316
- # Returns true if first character of the file is '>'.
317
- def probably_fasta?(file)
318
- File.read(file, 1) == '>'
319
- end
320
-
321
- # Suggests improved titles when generating database names from files
322
- # for improved apperance and readability in web interface.
323
- # For example:
324
- # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
325
- # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
326
- def make_db_title(db_name)
327
- db_name.tr!('"', "'")
328
- # removes .fasta like extension names
329
- db_name.gsub!(File.extname(db_name), '')
330
- # replaces _ with ' ',
331
- db_name.gsub!(/(_)/, ' ')
332
- # replaces '.' with ' ' when no numbers are on either side,
333
- db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
334
- # preserves version numbers
335
- db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
336
- db_name
337
- end
338
-
339
- # Guess whether FASTA file contains protein or nucleotide sequences by
340
- # sampling a few few characters of the file.
341
- def guess_sequence_type_in_fasta(file)
342
- sequences = sample_sequences(file)
343
- sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
344
- sequence_types = sequence_types.uniq.compact
345
- (sequence_types.length == 1) && sequence_types.first
346
- end
347
-
348
- # Read first 1,048,576 characters of the file, split the read text on
349
- # fasta def line pattern and return.
350
- #
351
- # If the given file is FASTA, returns Array of as many different
352
- # sequences in the portion of the file read. Returns the portion
353
- # of the file read wrapped in an Array otherwise.
354
- def sample_sequences(file)
355
- File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
356
- end
357
223
  end
358
224
  end
359
225
  end
@@ -0,0 +1,243 @@
1
+ require 'find'
2
+ require 'forwardable'
3
+
4
+ module SequenceServer
5
+ # Smart `makeblastdb` wrapper: recursively scans database directory determining
6
+ # which files need to be formatted or re-formatted.
7
+ #
8
+ # Example usage:
9
+ #
10
+ # makeblastdb = MAKEBLASTDB.new(database_dir)
11
+ # makeblastdb.scan && makeblastdb.run
12
+ #
13
+ class MAKEBLASTDB
14
+ # We want V5 databases created using -parse_seqids for proper function of
15
+ # SequenceServer. This means each database should be comprised of at least 9
16
+ # files with the following extensions. Databases created by us will have two
17
+ # additional files with the extensions nhd and nhi, or phd and phi, due to
18
+ # the use of -hash_index option. Finally, multipart databases will have one
19
+ # additional file with the extension nal or pal.
20
+ REQUIRED_EXTENSIONS = {
21
+ 'nucleotide' => %w{ndb nhr nin nog nos not nsq ntf nto}.freeze,
22
+ 'protein' => %w{pdb phr pin pog pos pot psq ptf pto}.freeze
23
+ }
24
+
25
+ extend Forwardable
26
+
27
+ def_delegators SequenceServer, :config, :sys
28
+
29
+ def initialize(database_dir)
30
+ @database_dir = database_dir
31
+ end
32
+
33
+ attr_reader :database_dir
34
+
35
+ # Scans the database directory to determine which FASTA files require
36
+ # formatting or re-formatting.
37
+ #
38
+ # Returns `true` if there are files to (re-)format, `false` otherwise.
39
+ def scan
40
+ # We need to know the list of formatted FASTAs as reported by blastdbcmd
41
+ # first. This is required to determine both unformatted FASTAs and those
42
+ # that require reformatting.
43
+ @formatted_fastas = []
44
+ determine_formatted_fastas
45
+
46
+ # Now determine FASTA files that are unformatted or require reformatting.
47
+ @fastas_to_format = []
48
+ determine_unformatted_fastas
49
+ determine_fastas_to_reformat
50
+
51
+ # Return true if there are files to be (re-)formatted or false otherwise.
52
+ !@fastas_to_format.empty?
53
+ end
54
+
55
+ # Runs makeblastdb on each file in `@fastas_to_format`. Will do nothing
56
+ # unless `#scan` has been run before.
57
+ def run
58
+ return unless @fastas_to_format || @fastas_to_format.empty?
59
+ @fastas_to_format.each do |path, title, type|
60
+ make_blast_database(path, title, type)
61
+ end
62
+ end
63
+
64
+ private
65
+
66
+ # Determines which FASTA files in the database directory are already
67
+ # formatted. Adds to @formatted_fastas.
68
+ def determine_formatted_fastas
69
+ blastdbcmd.each_line do |line|
70
+ path, title, type = line.split(' ')
71
+ next if multipart_database_name?(path)
72
+ @formatted_fastas << [path, title, type.strip.downcase]
73
+ end
74
+ end
75
+
76
+ # Determines which FASTA files in the database directory require
77
+ # reformatting. Adds to @fastas_to_format.
78
+ def determine_fastas_to_reformat
79
+ @formatted_fastas.each do |path, title, type|
80
+ required_extensions = REQUIRED_EXTENSIONS[type]
81
+ exts = Dir["#{path}.*"].map { |p| p.split('.').last }.sort
82
+ next if (exts & required_extensions) == required_extensions
83
+
84
+ @fastas_to_format << [path, title, type]
85
+ end
86
+ end
87
+
88
+ # Determines which FASTA files in the database directory are
89
+ # unformatted. Adds to @fastas_to_format.
90
+ def determine_unformatted_fastas
91
+ Find.find(database_dir) do |path|
92
+ next if File.directory?(path)
93
+ next unless probably_fasta?(path)
94
+ next if @formatted_fastas.any? { |f| f[0] == path }
95
+
96
+ @fastas_to_format << [path,
97
+ make_db_title(File.basename(path)),
98
+ guess_sequence_type_in_fasta(path)]
99
+ end
100
+ end
101
+
102
+ # Runs `blastdbcmd` to determine formatted FASTA files in the database
103
+ # directory. Returns the output of `blastdbcmd`. This method is called
104
+ # by `determine_formatted_fastas`.
105
+ def blastdbcmd
106
+ cmd = "blastdbcmd -recursive -list #{database_dir}" \
107
+ ' -list_outfmt "%f %t %p"'
108
+ out, _ = sys(cmd, path: config[:bin])
109
+ out
110
+ end
111
+
112
+ # Create BLAST database, given FASTA file, database title, and sequence type.
113
+ def make_blast_database(file, title, type)
114
+ return unless make_blast_database? file, type
115
+ title = confirm_database_title(title)
116
+ taxid = fetch_tax_id
117
+ _make_blast_database(file, type, title, taxid)
118
+ end
119
+
120
+ def _make_blast_database(file, type, title, taxid)
121
+ extract_fasta(file) unless File.exist?(file)
122
+ cmd = "makeblastdb -parse_seqids -hash_index -in #{file} " \
123
+ "-dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
124
+ " -taxid #{taxid}"
125
+ out, err = sys(cmd, path: config[:bin])
126
+ puts out.strip
127
+ puts err.strip
128
+ rescue CommandFailed => e
129
+ puts <<~MSG
130
+ Could not create BLAST database for: #{file}
131
+ Tried: #{cmd}
132
+ stdout: #{e.stdout}
133
+ stderr: #{e.stderr}
134
+ MSG
135
+ exit!
136
+ end
137
+
138
+ # Show file path and guessed sequence type to the user and obtain a y/n
139
+ # response.
140
+ #
141
+ # Returns true unless the user's response contains 'n' or 'N'.
142
+ def make_blast_database?(file, type)
143
+ puts
144
+ puts
145
+ puts "FASTA file to format/reformat: #{file}"
146
+ puts "FASTA type: #{type}"
147
+ print 'Proceed? [y/n] (Default: y): '
148
+ response = STDIN.gets.to_s.strip
149
+ !response.match(/n/i)
150
+ end
151
+
152
+ # Show the database title that we are going to use to the user for
153
+ # confirmation.
154
+ #
155
+ # Returns user input if any. Auto-determined title otherwise.
156
+ def confirm_database_title(default)
157
+ print "Enter a database title or will use '#{default}': "
158
+ from_user = STDIN.gets.to_s.strip
159
+ from_user.empty? && default || from_user
160
+ end
161
+
162
+ # Get taxid from the user. Returns user input or 0.
163
+ #
164
+ # Using 0 as taxid is equivalent to not setting taxid for the database
165
+ # that will be created.
166
+ def fetch_tax_id
167
+ default = 0
168
+ print 'Enter taxid (optional): '
169
+ user_response = STDIN.gets.strip
170
+ user_response.empty? && default || Integer(user_response)
171
+ rescue
172
+ puts 'taxid should be a number'
173
+ retry
174
+ end
175
+
176
+ # Extract FASTA file from BLAST database.
177
+ #
178
+ # Invoked while reformatting a BLAST database if the corresponding
179
+ # FASTA file does not exist.
180
+ def extract_fasta(db)
181
+ puts
182
+ puts 'Extracting sequences ...'
183
+ cmd = "blastdbcmd -entry all -db #{db}"
184
+ sys(cmd, stdout: db, path: config[:bin])
185
+ rescue CommandFailed => e
186
+ puts <<~MSG
187
+ Could not extract sequences from: #{db}
188
+ Tried: #{cmd}
189
+ stdout: #{e.stdout}
190
+ stderr: #{e.stderr}
191
+ MSG
192
+ exit!
193
+ end
194
+
195
+ # Returns true if the database name appears to be a multi-part database
196
+ # name.
197
+ def multipart_database_name?(db_name)
198
+ Database.multipart_database_name? db_name
199
+ end
200
+
201
+ # Returns true if first character of the file is '>'.
202
+ def probably_fasta?(file)
203
+ File.read(file, 1) == '>'
204
+ end
205
+
206
+ # Suggests improved titles when generating database names from files
207
+ # for improved appearance and readability in web interface.
208
+ # For example:
209
+ # Cobs1.4.proteins.fasta -> Cobs 1.4 proteins
210
+ # S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
211
+ def make_db_title(db_name)
212
+ db_name.tr!('"', "'")
213
+ # removes .fasta like extension names
214
+ db_name.gsub!(File.extname(db_name), '')
215
+ # replaces _ with ' ',
216
+ db_name.gsub!(/(_)/, ' ')
217
+ # replaces '.' with ' ' when no numbers are on either side,
218
+ db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
219
+ # preserves version numbers
220
+ db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
221
+ db_name
222
+ end
223
+
224
+ # Guess whether FASTA file contains protein or nucleotide sequences by
225
+ # sampling a few characters of the file.
226
+ def guess_sequence_type_in_fasta(file)
227
+ sequences = sample_sequences(file)
228
+ sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
229
+ sequence_types = sequence_types.uniq.compact
230
+ (sequence_types.length == 1) && sequence_types.first
231
+ end
232
+
233
+ # Read first 1,048,576 characters of the file, split the read text on
234
+ # fasta def line pattern and return.
235
+ #
236
+ # If the given file is FASTA, returns Array of as many different
237
+ # sequences in the portion of the file read. Returns the portion
238
+ # of the file read wrapped in an Array otherwise.
239
+ def sample_sequences(file)
240
+ File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
241
+ end
242
+ end
243
+ end