sequenceserver 0.6.7
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sequenceserver might be problematic. Click here for more details.
- data/Gemfile +4 -0
- data/LICENSE.txt +64 -0
- data/README.txt +5 -0
- data/bin/database_formatter +195 -0
- data/bin/sequenceserver +12 -0
- data/config.ru +5 -0
- data/example.config.yml +39 -0
- data/lib/sequenceserver/blast.rb +211 -0
- data/lib/sequenceserver/customisation.rb +60 -0
- data/lib/sequenceserver/database.rb +23 -0
- data/lib/sequenceserver/helpers.rb +127 -0
- data/lib/sequenceserver/sequencehelpers.rb +119 -0
- data/lib/sequenceserver/sinatralikeloggerformatter.rb +12 -0
- data/lib/sequenceserver.rb +525 -0
- data/public/blastResult.js +99 -0
- data/public/css/bootstrap.min.css +330 -0
- data/public/css/custom.css +92 -0
- data/public/js/jquery.enablePlaceholder.min.js +11 -0
- data/public/js/jquery.js +18 -0
- data/public/js/search.js +148 -0
- data/sequenceserver.gemspec +42 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta +5486 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nhr +0 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nin +0 -0
- data/tests/database/nucleotide/Sinvicta2-2-3.cdna.subset.fasta.nsq +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta +6449 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.phr +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.pin +0 -0
- data/tests/database/protein/Sinvicta2-2-3.prot.subset.fasta.psq +0 -0
- data/tests/test_sequencehelpers.rb +85 -0
- data/views/500.erb +22 -0
- data/views/search.erb +221 -0
- metadata +141 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'shellwords'

require 'sequenceserver/database'
|
2
|
+
|
3
|
+
module SequenceServer
|
4
|
+
module Helpers
|
5
|
+
module SystemHelpers
|
6
|
+
# Scan the given directory for blast executables. Passing `nil` scans the
# system `PATH`.
# ---
# Arguments:
# * bin(String) - absolute path to the directory containing blast binaries,
#                 or `nil` to look every executable up by its bare name
# ---
# Returns:
# * a hash of blast methods, and their corresponding absolute path (the
#   bare executable name when `bin` is `nil`)
# ---
# Raises:
# * IOError - if `bin` is given but is not a directory, or if any of the
#             executables can't be found
#
# > scan_blast_executables('/home/yeban/bin')
# => { "blastx"=>"/home/yeban/bin/blastx",
#      "blastn"=>"/home/yeban/bin/blastn",
#      ...
#    }
def scan_blast_executables(bin)
  if bin && !File.directory?(bin)
    raise IOError, "Could not find '#{bin}' defined in config.yml."
  end

  binaries = {}
  %w|blastn blastp blastx tblastn tblastx blastdbcmd makeblastdb blast_formatter|.each do |method|
    # Only a missing `bin` falls back to a bare-name lookup. An inline
    # `rescue` modifier here would also swallow unrelated errors.
    path = bin ? File.join(bin, method) : method

    unless command?(path)
      blasturl = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download'
      raise IOError, "Could not find blast binaries. You may need to\n" \
                     "install BLAST+ from #{blasturl}. And/or point config.yml to blast's\n" \
                     "bin directory."
    end

    binaries[method] = path
  end

  #LOG.info("Config bin dir: #{bin}")
  binaries
end
|
44
|
+
|
45
|
+
# Scan the given directory (including subdirectory) for blast databases.
# ---
# Arguments:
# * db_root(String)    - absolute path to the blast databases
# * blastdbcmd(String) - command used to list the databases; interpolated
#                        into the shell command as-is (default 'blastdbcmd')
# ---
# Returns:
# * a hash of sorted blast databases grouped by database type:
#   protein, or nucleotide (both keys are always present)
# ---
# Raises:
# * IOError - if `db_root` is not a directory, if no database can be found
#             (and the user declines to format them), or if blastdbcmd
#             reports a "BLAST Database error"
#
# > scan_blast_db('/home/yeban/blast_db')
# => { "protein" => [], "nucleotide" => [] }
def scan_blast_db(db_root, blastdbcmd = 'blastdbcmd')
  raise IOError, "Database directory doesn't exist: #{db_root}" unless File.directory?(db_root)

  # Escape the directory so that paths containing spaces or shell
  # metacharacters reach blastdbcmd as one single argument.
  find_dbs_command = %|#{blastdbcmd} -recursive -list #{Shellwords.escape(db_root.to_s)} -list_outfmt "%p %f %t" 2>&1|

  begin
    db_list = %x|#{find_dbs_command}|
    if db_list.empty?
      raise IOError, "No formatted blast databases found in '#{ db_root }'."
    end
  rescue => e
    puts '', e.to_s

    print "Do you want to format your blast databases now? [Y/n]: "
    # NOTE(review): Kernel#gets reads from ARGF, i.e. from files named in
    # ARGV when there are any - confirm this is only reached without
    # leftover command line arguments.
    answer = gets
    # `gets` returns nil at end of input; treat that as "no" so the original
    # error is re-raised instead of failing with NoMethodError on nil.
    choice = answer ? answer.chomp[0, 1].downcase : 'n'

    # let the caller decide what to do if database discovery fails
    raise if choice == 'n'

    database_formatter = File.join(settings.root, 'database_formatter.rb')
    # Separate arguments bypass the shell, so db_root needs no quoting here.
    system(database_formatter, db_root.to_s)
    retry
  end

  if db_list.match(/BLAST Database error/)
    raise IOError, "Error parsing blast databases.\n" + "Tried: '#{find_dbs_command}'\n"+
      "It crashed with the following error: '#{db_list}'\n" +
      "Try reformatting databases using makeblastdb.\n"
  end

  db = {}

  db_list.each_line do |line|
    # each_line keeps the trailing newline, so a blank line is "\n" rather
    # than "" - strip before testing (blank lines: required for BLAST+ 2.2.22)
    next if line.strip.empty?

    type, name, *title = line.split(' ')
    type  = type.downcase
    name  = name.freeze
    title = title.join(' ').freeze
    #LOG.info("Found #{type} database: #{title} at #{name}")
    (db[type] ||= []) << Database.new(name, title)
  end

  # the erb would fail as calling nil.each_with_index if a dbtype was undefined.
  db['protein']    ||= []
  db['nucleotide'] ||= []

  # sort the list of dbs
  db['protein'].sort!
  db['nucleotide'].sort!

  db
end
|
113
|
+
|
114
|
+
private
|
115
|
+
|
116
|
+
# Check if the given command exists and is executable.
# ---
# Arguments:
# * command(String) - bare executable name or path handed to `which`
# ---
# Returns:
# * true if `which` locates the command; false otherwise (Kernel#system
#   yields nil if the shell command could not be run at all)
def command?(command)
  # Escape the argument: unescaped, a path containing spaces is split into
  # several words and shell metacharacters (e.g. ';') change the result.
  system("which #{Shellwords.escape(command.to_s)} > /dev/null 2>&1")
end
|
121
|
+
end
|
122
|
+
|
123
|
+
# Module#included hook: whenever this module is mixed into +klass+, also
# extend +klass+ with SystemHelpers so those helpers become available as
# methods on +klass+ itself.
def self.included(klass)
  klass.extend(SystemHelpers)
end
|
126
|
+
end
|
127
|
+
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
module SequenceServer
|
2
|
+
# Module to collect some sequence-related helper functions
|
3
|
+
module SequenceHelpers
|
4
|
+
|
5
|
+
# Tally how often each character occurs in +sequence_string+ (approach
# copied from bioruby's Bio::Sequence). Newlines are not counted, because
# `.` does not match "\n".
#
# Returns a Hash whose default value is 0. Eg:
#   composition("asdfasdfffffasdf")
#   => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
def composition(sequence_string)
  sequence_string.scan(/./).inject(Hash.new(0)) do |tally, char|
    tally[char] += 1
    tally
  end
end
|
15
|
+
|
16
|
+
# Guess whether +sequence_string+ is a nucleotide or a protein sequence.
#
# Everything that is not a letter is dropped first, followed by the
# ambiguity codes N and X (either case). Then:
# * fewer than 10 remaining letters            => nil (too little to tell)
# * more than 90% of them are A, C, G, T or U  => :nucleotide
# * otherwise                                  => :protein
def guess_sequence_type(sequence_string)
  letters = sequence_string.gsub(/[^A-Z]/i, '').gsub(/[NX]/i, '')

  return nil if letters.length < 10 # conservative

  # number of characters that could be nucleic acid codes (case-insensitive)
  putative_na_total = letters.count('ACGTUacgtu')

  putative_na_total > (0.9 * letters.length) ? :nucleotide : :protein
end
|
37
|
+
|
38
|
+
# Determine the common sequence type of a (multi-)fasta string.
#
# The input is split at putative fasta definition lines (like ">adsfadsf")
# and the type of every resulting chunk is guessed separately:
# * no chunk has enough sequence to decide  => nil
# * all decidable chunks agree              => :nucleotide or :protein
# * both kinds are mixed together           => raises ArgumentError
def type_of_sequences(fasta_format_string)
  # the first sequence does not need to have a fasta definition line
  sequences = fasta_format_string.split(/^>.*$/).reject { |chunk| chunk.empty? }

  # distinct types among the chunks that could be classified
  sequence_types = sequences.map { |chunk| guess_sequence_type(chunk) }.compact.uniq

  case sequence_types.length
  when 0
    nil
  when 1
    sequence_types.first
  else
    raise ArgumentError, "Insufficient info to determine sequence type. Cleaned queries are: #{ sequences.to_s }"
  end
end
|
57
|
+
|
58
|
+
# Return the database type that can be used for a given blast method.
#   db_type_for("blastp")  => :protein
#   db_type_for("tblastn") => :nucleotide
#   db_type_for(nil)       => nil
# Any value that is not one of the five known method names yields nil.
def db_type_for(blast_method)
  if %w[blastp blastx].include?(blast_method)
    :protein
  elsif %w[blastn tblastx tblastn].include?(blast_method)
    :nucleotide
  end
end
|
70
|
+
|
71
|
+
# Return the blast methods that can be used for a given type of sequence.
#   blast_methods_for(:protein)    => ['blastp', 'tblastn']
#   blast_methods_for(:nucleotide) => ['blastn','tblastx','blastx']
#   blast_methods_for(nil)         => ['blastp', 'tblastn','blastn','tblastx','blastx']
# A fresh Array is built on every call.
def blast_methods_for(seq_type)
  by_type = {
    :protein    => ['blastp', 'tblastn'],
    :nucleotide => ['blastn','tblastx','blastx']
  }

  # Sequence type not predicted, so don't make any assumptions about the blast method
  by_type.fetch(seq_type) { ['blastp', 'tblastn','blastn','tblastx','blastx'] }
end
|
85
|
+
|
86
|
+
def sequence_from_blastdb(ids, db) # helpful when displaying parsed blast results
|
87
|
+
# we know how to handle an Array of ids
|
88
|
+
ids = ids.join(',') if ids.is_a? Array
|
89
|
+
|
90
|
+
# we don't know what to do if the arguments ain't String
|
91
|
+
raise TypeError unless ids.is_a? String and db.is_a? String
|
92
|
+
|
93
|
+
# query now!
|
94
|
+
#
|
95
|
+
# If `blastdbcmd` throws error, we assume sequence not found.
|
96
|
+
blastdbcmd = settings.binaries['blastdbcmd']
|
97
|
+
command = %x|#{blastdbcmd} -db #{db} -entry #{ids} 2> /dev/null|
|
98
|
+
end
|
99
|
+
|
100
|
+
# Given a sequence_id and databases, apply the default (standard)
# way to convert a sequence_id into a hyperlink, so that the
# blast results include hyperlinks.
# ---
# Arguments:
# * options(Hash) - :sequence_id (String) is the hit's id line,
#                   :databases (Array) lists the databases searched
# ---
# Returns:
# * the link (String), or nil when no link should be incorporated
#
# Every id a link is built for is also appended to @all_retrievable_ids.
def construct_standard_sequence_hyperlink(options)
  sequence_id = options[:sequence_id]

  # if there is a space right after the '>', makeblastdb was run without
  # -parse_seqids: link == nil means no link will be incorporated
  return nil unless sequence_id.match(/^[^ ]/)

  # By default, add a link to a fasta file of the sequence (if makeblastdb
  # was called with -parse_seqids)
  complete_id = sequence_id[/^(\S+)\s*.*/, 1] # get id part

  # An id line starting with other whitespace (e.g. a tab) passes the check
  # above but yields no id; treat it as "no link" instead of failing on nil.
  return nil if complete_id.nil?

  # for ids of the form 'xx|id|...' use the second field, else the whole id
  id = complete_id.include?('|') ? complete_id.split('|')[1] : complete_id
  @all_retrievable_ids ||= []
  @all_retrievable_ids.push(id)

  "/get_sequence/?id=#{id}&db=#{options[:databases].join(' ')}" # several dbs... separate by ' '
end
|
118
|
+
end
|
119
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module SequenceServer
|
2
|
+
# We change Logging format so that it is consistent with Sinatra's
#
# Every record is rendered as "[timestamp] SEVERITY message\n"; the program
# name handed to #call is not part of the output.
#
# NOTE(review): relies on the 'logger' library being loaded before this
# file; no require for it is visible here.
class SinatraLikeLogFormatter < Logger::Formatter
  # sprintf template: timestamp, severity, message
  MyFormat = "[%s] %s %s\n"

  def initialize
    # let Logger::Formatter set up its own state before we customise it
    super
    self.datetime_format = "%Y-%m-%d %H:%M:%S"
  end

  # Format a single log record.
  # ---
  # Arguments:
  # * severity - severity label, e.g. "INFO"
  # * time     - Time of the record, rendered using datetime_format
  # * progname - ignored
  # * msg      - the message; converted by Logger::Formatter's msg2str
  # ---
  # Returns:
  # * the formatted line (String)
  def call(severity, time, progname, msg)
    MyFormat % [format_datetime(time), severity, msg2str(msg)]
  end
end
|
12
|
+
end
|