sequenceserver 0.6.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/LICENSE.txt +64 -0
- data/README.txt +5 -0
- data/bin/database_formatter +195 -0
- data/bin/sequenceserver +12 -0
- data/config.ru +5 -0
- data/example.config.yml +39 -0
- data/lib/sequenceserver/blast.rb +211 -0
- data/lib/sequenceserver/customisation.rb +60 -0
- data/lib/sequenceserver/database.rb +23 -0
- data/lib/sequenceserver/helpers.rb +127 -0
- data/lib/sequenceserver/sequencehelpers.rb +119 -0
- data/lib/sequenceserver/sinatralikeloggerformatter.rb +12 -0
- data/lib/sequenceserver.rb +525 -0
- data/public/blastResult.js +99 -0
- data/public/css/bootstrap.min.css +330 -0
- data/public/css/custom.css +92 -0
- data/public/js/jquery.enablePlaceholder.min.js +11 -0
- data/public/js/jquery.js +18 -0
- data/public/js/search.js +148 -0
- data/sequenceserver.gemspec +42 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/tests/test_sequencehelpers.rb +85 -0
- data/views/500.erb +22 -0
- data/views/search.erb +221 -0
- metadata +141 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'sequenceserver/database'
|
2
|
+
|
3
|
+
require 'shellwords'

module SequenceServer
  module Helpers
    # Helpers that talk to the operating system: locating the BLAST+ binaries
    # and discovering the formatted BLAST databases.
    module SystemHelpers
      # Scan the given directory for blast executables. Passing `nil` scans the
      # system `PATH`.
      # ---
      # Arguments:
      # * bin(String) - absolute path to the directory containing blast binaries
      # ---
      # Returns:
      # * a hash of blast methods, and their corresponding absolute path
      #   (or the bare program name when `bin` is `nil`)
      # ---
      # Raises:
      # * IOError - if `bin` is given but is not a directory, or if any one of
      #   the executables can't be found
      #
      # > scan_blast_executables('/home/yeban/bin')
      # => { "blastx"=>"/home/yeban/bin/blastx",
      #      "blastn"=>"/home/yeban/bin/blastn",
      #      ...
      #    }
      def scan_blast_executables(bin)
        if bin && !File.directory?(bin)
          raise IOError, "Could not find '#{bin}' defined in config.yml."
        end

        binaries = {}
        %w[blastn blastp blastx tblastn tblastx blastdbcmd makeblastdb blast_formatter].each do |method|
          # Without a configured directory the bare name is looked up on PATH.
          path = bin ? File.join(bin, method) : method

          unless command?(path)
            blasturl = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download'
            raise IOError, "Could not find blast binaries. You may need to\n" \
                           "install BLAST+ from #{blasturl}. And/or point config.yml to blast's\n" \
                           "bin directory."
          end

          binaries[method] = path
        end

        #LOG.info("Config bin dir: #{bin}")
        binaries
      end

      # Scan the given directory (including subdirectory) for blast databases.
      # ---
      # Arguments:
      # * db_root(String)    - absolute path to the blast databases
      # * blastdbcmd(String) - command used to list the databases; it is
      #   placed verbatim at the start of the shell command line
      # ---
      # Returns:
      # * a hash of sorted blast databases grouped by database type:
      #   protein, or nucleotide
      # ---
      # Raises:
      # * IOError - if `db_root` is not a directory, if no database can be
      #   found (and the user declines to format them, or standard input is
      #   at end-of-file), or if blastdbcmd reports a database error
      #
      # > scan_blast_db('/home/yeban/blast_db')
      # => { "protein" => [], "nucleotide" => [] }
      def scan_blast_db(db_root, blastdbcmd = 'blastdbcmd')
        raise IOError, "Database directory doesn't exist: #{db_root}" unless File.directory?(db_root)

        # db_root is escaped so that a path containing spaces or shell
        # metacharacters reaches blastdbcmd as a single argument.
        find_dbs_command = %|#{blastdbcmd} -recursive -list #{Shellwords.escape(db_root)} -list_outfmt "%p %f %t" 2>&1|

        begin
          db_list = %x|#{find_dbs_command}|
          if db_list.empty?
            raise IOError, "No formatted blast databases found in '#{ db_root }'."
          end
        rescue => e
          puts '', e.to_s

          print "Do you want to format your blast databases now? [Y/n]: "
          # Read from $stdin explicitly: a bare `gets` reads from ARGF, i.e.
          # from files named on the command line if there are any.
          answer = $stdin.gets

          # End-of-file means nobody can answer the prompt; let the caller
          # decide what to do if database discovery fails.
          raise if answer.nil?
          raise if answer.chomp[0, 1].downcase == 'n'

          database_formatter = File.join(settings.root, 'database_formatter.rb')
          # Separate arguments bypass the shell, so neither path is word-split.
          system(database_formatter, db_root)
          retry
        end

        if db_list.match(/BLAST Database error/)
          raise IOError, "Error parsing blast databases.\n" + "Tried: '#{find_dbs_command}'\n"+
            "It crashed with the following error: '#{db_list}'\n" +
            "Try reformatting databases using makeblastdb.\n"
        end

        db = {}

        db_list.each_line do |line|
          # each_line keeps the line terminator, so a blank line is "\n" and
          # never `empty?`; strip before testing (required for BLAST+ 2.2.22).
          next if line.strip.empty?

          type, name, *title = line.split(' ')
          #LOG.info("Found #{type} database: #{title} at #{name}")
          (db[type.downcase] ||= []) << Database.new(name.freeze, title.join(' ').freeze)
        end

        # the erb would fail as calling nil.each_with_index if a dbtype was undefined.
        db['protein']    ||= []
        db['nucleotide'] ||= []

        # sort the list of dbs
        db['protein'].sort!
        db['nucleotide'].sort!

        db
      end

      private

      # Check if the given command exists and is executable, by asking `which`.
      # Returns true if all is good, and a false value otherwise.
      def command?(command)
        # Escaped so that a path with spaces or shell metacharacters is looked
        # up as one word instead of being interpreted by the shell.
        system("which #{Shellwords.escape(command.to_s)} > /dev/null 2>&1")
      end
    end

    # Including Helpers in a class also makes the SystemHelpers methods
    # available on the class itself.
    def self.included(klass)
      klass.extend SystemHelpers
    end
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
module SequenceServer
|
2
|
+
# Module to collect some sequence-related helper functions
|
3
|
+
module SequenceHelpers
|
4
|
+
|
5
|
+
# copied from bioruby's Bio::Sequence
|
6
|
+
# returns a Hash. Eg: composition("asdfasdfffffasdf")
|
7
|
+
# => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
|
8
|
+
def composition(sequence_string)
|
9
|
+
count = Hash.new(0)
|
10
|
+
sequence_string.scan(/./) do |x|
|
11
|
+
count[x] += 1
|
12
|
+
end
|
13
|
+
return count
|
14
|
+
end
|
15
|
+
|
16
|
+
# Strips all non-letter characters. guestimates sequence based on that.
|
17
|
+
# If less than 10 useable characters... returns nil
|
18
|
+
# If more than 90% ACGTU returns :nucleotide. else returns :protein
|
19
|
+
def guess_sequence_type(sequence_string)
|
20
|
+
cleaned_sequence = sequence_string.gsub(/[^A-Z]/i, '') # removing non-letter characters
|
21
|
+
cleaned_sequence.gsub!(/[NX]/i, '') # removing ambiguous characters
|
22
|
+
|
23
|
+
return nil if cleaned_sequence.length < 10 # conservative
|
24
|
+
|
25
|
+
composition = composition(cleaned_sequence)
|
26
|
+
composition_NAs = composition.select { |character, count|character.match(/[ACGTU]/i) } # only putative NAs
|
27
|
+
putative_NA_counts = composition_NAs.collect { |key_value_array| key_value_array[1] } # only count, not char
|
28
|
+
putative_NA_sum = putative_NA_counts.inject { |sum, n| sum + n } # count of all putative NA
|
29
|
+
putative_NA_sum = 0 if putative_NA_sum.nil?
|
30
|
+
|
31
|
+
if putative_NA_sum > (0.9 * cleaned_sequence.length)
|
32
|
+
return :nucleotide
|
33
|
+
else
|
34
|
+
return :protein
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# splits input at putative fasta definition lines (like ">adsfadsf"), guesses sequence type for each sequence.
|
39
|
+
# if not enough sequence to determine, returns nil.
|
40
|
+
# if 2 kinds of sequence mixed together, raises ArgumentError
|
41
|
+
# otherwise, returns :nucleotide or :protein
|
42
|
+
def type_of_sequences(fasta_format_string)
|
43
|
+
# the first sequence does not need to have a fasta definition line
|
44
|
+
sequences = fasta_format_string.split(/^>.*$/).delete_if { |seq| seq.empty? }
|
45
|
+
|
46
|
+
# get all sequence types
|
47
|
+
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }.uniq.compact
|
48
|
+
|
49
|
+
return nil if sequence_types.empty?
|
50
|
+
|
51
|
+
if sequence_types.length == 1
|
52
|
+
return sequence_types.first # there is only one (but yes its an array)
|
53
|
+
else
|
54
|
+
raise ArgumentError, "Insufficient info to determine sequence type. Cleaned queries are: #{ sequences.to_s }"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# Return the database type that can be used for a given blast method.
|
59
|
+
# db_type_for("blastp") => :protein
|
60
|
+
# db_type_for("tblastn") => :nucleotide
|
61
|
+
# db_type_for(nil) => nil
|
62
|
+
def db_type_for(blast_method)
|
63
|
+
case blast_method
|
64
|
+
when 'blastp', 'blastx'
|
65
|
+
:protein
|
66
|
+
when 'blastn', 'tblastx', 'tblastn'
|
67
|
+
:nucleotide
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Return the blast methods that can be used for a given type of sequence.
|
72
|
+
# blast_methods_for(:protein) => ['blastp', 'tblastn']
|
73
|
+
# blast_methods_for(:nucleotide) => ['blastn','tblastx','blastx']
|
74
|
+
# blast_methods_for(nil) => ['blastp', 'tblastn','blastn','tblastx','blastx']
|
75
|
+
def blast_methods_for(seq_type)
|
76
|
+
case seq_type
|
77
|
+
when :protein
|
78
|
+
['blastp', 'tblastn']
|
79
|
+
when :nucleotide
|
80
|
+
['blastn','tblastx','blastx']
|
81
|
+
else # Sequence type not predicted, so don't make any assumptions about the blast method
|
82
|
+
['blastp', 'tblastn','blastn','tblastx','blastx']
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def sequence_from_blastdb(ids, db) # helpful when displaying parsed blast results
|
87
|
+
# we know how to handle an Array of ids
|
88
|
+
ids = ids.join(',') if ids.is_a? Array
|
89
|
+
|
90
|
+
# we don't know what to do if the arguments ain't String
|
91
|
+
raise TypeError unless ids.is_a? String and db.is_a? String
|
92
|
+
|
93
|
+
# query now!
|
94
|
+
#
|
95
|
+
# If `blastdbcmd` throws error, we assume sequence not found.
|
96
|
+
blastdbcmd = settings.binaries['blastdbcmd']
|
97
|
+
command = %x|#{blastdbcmd} -db #{db} -entry #{ids} 2> /dev/null|
|
98
|
+
end
|
99
|
+
|
100
|
+
# Given a sequence_id and databases, apply the default (standard)
|
101
|
+
# way to convert a sequence_id into a hyperlink, so that the
|
102
|
+
# blast results include hyperlinks.
|
103
|
+
def construct_standard_sequence_hyperlink(options)
|
104
|
+
if options[:sequence_id].match(/^[^ ]/) #if there is a space right after the '>', makeblastdb was run without -parse_seqids
|
105
|
+
# By default, add a link to a fasta file of the sequence (if makeblastdb was called with -parse_seqids)
|
106
|
+
complete_id = options[:sequence_id][/^(\S+)\s*.*/, 1] # get id part
|
107
|
+
id = complete_id.include?('|') ? complete_id.split('|')[1] : complete_id.split('|')[0]
|
108
|
+
@all_retrievable_ids ||= []
|
109
|
+
@all_retrievable_ids.push(id)
|
110
|
+
|
111
|
+
link = "/get_sequence/?id=#{id}&db=#{options[:databases].join(' ')}" # several dbs... separate by ' '
|
112
|
+
return link
|
113
|
+
else
|
114
|
+
# do nothing - link == nil means no link will be incorporated
|
115
|
+
return nil
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'logger'

module SequenceServer
  # We change Logging format so that it is consistent with Sinatra's
  class SinatraLikeLogFormatter < Logger::Formatter
    # Layout of one log line: timestamp, severity, message.
    MyFormat = "[%s] %s %s\n"

    # Set up the base formatter, then switch to a timestamp without
    # sub-second precision.
    def initialize
      super
      self.datetime_format = "%Y-%m-%d %H:%M:%S"
    end

    # Format one log entry.
    # ---
    # Arguments:
    # * severity - severity label of the entry
    # * time     - time of the entry; formatted with `datetime_format`
    # * progname - program name; accepted but not part of the output
    # * msg      - the message; converted with the base class' `msg2str`
    # ---
    # Returns:
    # * the formatted line (String), terminated by a newline
    def call(severity, time, progname, msg)
      MyFormat % [format_datetime(time), severity, msg2str(msg)]
    end
  end
end
|