sequenceserver 0.6.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,127 @@
1
+ require 'sequenceserver/database'
2
+
3
+ module SequenceServer
4
+ module Helpers
5
+ module SystemHelpers
6
+ # Scan the given directory for blast executables. Passing `nil` scans the
7
+ # system `PATH`.
8
+ # ---
9
+ # Arguments:
10
+ # * bin(String) - absolute path to the directory containing blast binaries
11
+ # ---
12
+ # Returns:
13
+ # * a hash mapping each blast method to the path of its executable (the bare command name when `bin` is nil)
14
+ # ---
15
+ # Raises:
16
+ # * IOError - if the executables can't be found
17
+ #
18
+ # > scan_blast_executables('/home/yeban/bin')
19
+ # => { "blastx"=>"/home/yeban/bin/blastx",
20
+ # "blastn"=>"/home/yeban/bin/blastn",
21
+ # ...
22
+ # }
23
def scan_blast_executables(bin)
  # Locate every BLAST+ program SequenceServer needs, either inside `bin`
  # or, when `bin` is nil, on the system PATH.
  # ---
  # Returns a hash of program name => path the program was found under.
  # Raises IOError if `bin` is given but is not a directory, or if any of
  # the programs can't be found.
  if bin && !File.directory?(bin)
    raise IOError, "Could not find '#{bin}' defined in config.yml."
  end

  binaries = {}
  %w|blastn blastp blastx tblastn tblastx blastdbcmd makeblastdb blast_formatter|.each do |method|
    # Without a configured directory, use the bare program name so that
    # `command?` resolves it through PATH.
    path = bin ? File.join(bin, method) : method

    unless command?(path)
      blasturl = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download'
      raise IOError, "Could not find blast binaries. You may need to\n" \
        "install BLAST+ from #{blasturl}. And/or point config.yml to blast's\n" \
        "bin directory."
    end

    binaries[method] = path
  end

  #LOG.info("Config bin dir: #{bin}")
  binaries
end
44
+
45
+ # Scan the given directory (including subdirectory) for blast databases.
46
+ # ---
47
+ # Arguments:
48
+ # * db_root(String) - absolute path to the blast databases
49
+ # ---
50
+ # Returns:
51
+ # * a hash of sorted blast databases grouped by database type:
52
+ # protein, or nucleotide
53
+ # ---
54
+ # Raises:
55
+ # * IOError - if no database can be found
56
+ #
57
+ # > scan_blast_db('/home/yeban/blast_db')
58
+ # => { "protein" => [], "nucleotide" => [] }
59
def scan_blast_db(db_root, blastdbcmd = 'blastdbcmd')
  # List the formatted BLAST databases under `db_root` by running
  # `blastdbcmd -recursive -list`, and group them by database type.
  # ---
  # Returns a hash of type => sorted Array of Database; the 'protein' and
  # 'nucleotide' keys are always present.
  # Raises IOError if `db_root` is not a directory, if no database is found
  # and the user declines to format, or if blastdbcmd reports a database
  # error.
  raise IOError, "Database directory doesn't exist: #{db_root}" unless File.directory?(db_root)

  find_dbs_command = %|#{blastdbcmd} -recursive -list #{db_root} -list_outfmt "%p %f %t" 2>&1|

  begin
    db_list = %x|#{find_dbs_command}|
    if db_list.empty?
      raise IOError, "No formatted blast databases found in '#{ db_root }'."
    end
  rescue => e
    puts '', e.to_s

    print "Do you want to format your blast databases now? [Y/n]: "
    answer = gets
    # `gets` returns nil on EOF (e.g. no terminal attached); treat that as
    # "no" instead of crashing on nil or retrying with nobody to answer.
    choice = answer ? answer.chomp[0, 1].to_s.downcase : 'n'

    # let the caller decide what to do if database discovery fails
    raise if choice == 'n'

    database_formatter = File.join(settings.root, 'database_formatter.rb')
    system("#{database_formatter} #{db_root}")
    retry
  end

  if db_list.match(/BLAST Database error/)
    raise IOError, "Error parsing blast databases.\n" + "Tried: '#{find_dbs_command}'\n"+
      "It crashed with the following error: '#{db_list}'\n" +
      "Try reformatting databases using makeblastdb.\n"
  end

  db = {}

  db_list.each_line do |line|
    # each_line keeps the trailing newline, so a blank line is "\n" rather
    # than ""; strip before testing (blank lines occur with BLAST+ 2.2.22)
    next if line.strip.empty?

    type, name, *title = line.split(' ')
    type  = type.downcase
    name  = name.freeze
    title = title.join(' ').freeze
    #LOG.info("Found #{type} database: #{title} at #{name}")
    (db[type] ||= []) << Database.new(name, title)
  end

  # the erb would fail as calling nil.each_with_index if a dbtype was undefined.
  db['protein']    ||= []
  db['nucleotide'] ||= []

  # sort the list of dbs
  db['protein'].sort!
  db['nucleotide'].sort!

  db
end
113
+
114
+ private
115
+
116
+ # check if the given command exists and is executable
117
+ # returns True if all is good.
118
def command?(command)
  # Report whether `command` names an executable file.
  # A name containing a path separator is checked as given; a bare name is
  # looked up in each directory of PATH. The lookup is done in Ruby rather
  # than by interpolating `command` into a `which` shell string, so paths
  # with spaces work and shell metacharacters are never interpreted.
  # Returns true or false.
  name = command.to_s
  return false if name.empty?

  runnable = lambda { |path| File.file?(path) && File.executable?(path) }
  return runnable.call(name) if name.include?(File::SEPARATOR)

  ENV['PATH'].to_s.split(File::PATH_SEPARATOR).any? do |dir|
    # an empty PATH entry conventionally means the current directory
    runnable.call(File.join(dir.empty? ? '.' : dir, name))
  end
end
121
+ end
122
+
123
def self.included(klass)
  # Hook run when this module is included: the including class is also
  # extended with SystemHelpers, making the helpers callable on the class.
  klass.extend(SystemHelpers)
end
126
+ end
127
+ end
@@ -0,0 +1,119 @@
1
+ module SequenceServer
2
+ # Module to collect some sequence-related helper functions
3
+ module SequenceHelpers
4
+
5
+ # copied from bioruby's Bio::Sequence
6
+ # returns a Hash. Eg: composition("asdfasdfffffasdf")
7
+ # => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
8
def composition(sequence_string)
  # Tally how often each character occurs. `/./` does not match newlines,
  # so line breaks are left out of the tally. The returned hash answers 0
  # for characters that never occurred.
  sequence_string.scan(/./).each_with_object(Hash.new(0)) do |char, tally|
    tally[char] += 1
  end
end
15
+
16
+ # Strips all non-letter characters, then guesses the sequence type from what remains.
17
+ # If fewer than 10 usable characters remain, returns nil.
18
+ # If more than 90% of them are A, C, G, T or U, returns :nucleotide; otherwise returns :protein.
19
def guess_sequence_type(sequence_string)
  # Keep letters only, then drop the ambiguity codes N and X.
  letters = sequence_string.gsub(/[^A-Z]/i, '').gsub(/[NX]/i, '')

  # conservative: too little left to make a call
  return nil if letters.length < 10

  # number of characters that could be nucleic acid codes (either case)
  nucleotide_like = letters.count('ACGTUacgtu')

  nucleotide_like > (0.9 * letters.length) ? :nucleotide : :protein
end
37
+
38
+ # splits input at putative fasta definition lines (like ">adsfadsf"), guesses sequence type for each sequence.
39
+ # if not enough sequence to determine, returns nil.
40
+ # if 2 kinds of sequence mixed together, raises ArgumentError
41
+ # otherwise, returns :nucleotide or :protein
42
def type_of_sequences(fasta_format_string)
  # Cut the input at FASTA definition lines; text before the first
  # definition line counts as a sequence too.
  sequences = fasta_format_string.split(/^>.*$/).reject { |seq| seq.empty? }

  # distinct types among the sequences that could be classified
  sequence_types = sequences.map { |seq| guess_sequence_type(seq) }.compact.uniq

  case sequence_types.length
  when 0
    nil # nothing was long enough to classify
  when 1
    sequence_types.first
  else
    raise ArgumentError, "Insufficient info to determine sequence type. Cleaned queries are: #{ sequences.to_s }"
  end
end
57
+
58
+ # Return the database type that can be used for a given blast method.
59
+ # db_type_for("blastp") => :protein
60
+ # db_type_for("tblastn") => :nucleotide
61
+ # db_type_for(nil) => nil
62
def db_type_for(blast_method)
  # programs that search a protein database
  return :protein if ['blastp', 'blastx'].include?(blast_method)

  # programs that search a nucleotide database
  return :nucleotide if ['blastn', 'tblastx', 'tblastn'].include?(blast_method)

  # anything else (including nil) has no database type
  nil
end
70
+
71
+ # Return the blast methods that can be used for a given type of sequence.
72
+ # blast_methods_for(:protein) => ['blastp', 'tblastn']
73
+ # blast_methods_for(:nucleotide) => ['blastn','tblastx','blastx']
74
+ # blast_methods_for(nil) => ['blastp', 'tblastn','blastn','tblastx','blastx']
75
def blast_methods_for(seq_type)
  by_query_type = {
    :protein    => ['blastp', 'tblastn'],
    :nucleotide => ['blastn', 'tblastx', 'blastx']
  }

  # Sequence type not predicted, so don't make any assumptions about the
  # blast method: offer every method.
  by_query_type.fetch(seq_type) do
    by_query_type[:protein] + by_query_type[:nucleotide]
  end
end
85
+
86
+ def sequence_from_blastdb(ids, db) # helpful when displaying parsed blast results
87
+ # we know how to handle an Array of ids
88
+ ids = ids.join(',') if ids.is_a? Array
89
+
90
+ # we don't know what to do if the arguments ain't String
91
+ raise TypeError unless ids.is_a? String and db.is_a? String
92
+
93
+ # query now!
94
+ #
95
+ # If `blastdbcmd` throws error, we assume sequence not found.
96
+ blastdbcmd = settings.binaries['blastdbcmd']
97
+ command = %x|#{blastdbcmd} -db #{db} -entry #{ids} 2> /dev/null|
98
+ end
99
+
100
+ # Given a sequence_id and databases, apply the default (standard)
101
+ # way to convert a sequence_id into a hyperlink, so that the
102
+ # blast results include hyperlinks.
103
def construct_standard_sequence_hyperlink(options)
  sequence_id = options[:sequence_id]

  # If there is a space right after the '>', makeblastdb was run without
  # -parse_seqids: return nil, meaning no link will be incorporated.
  return nil unless sequence_id.match(/^[^ ]/)

  # By default, link to a fasta file of the sequence.
  complete_id = sequence_id[/^(\S+)\s*.*/, 1] # get id part

  # for a '|'-separated id, the second field is used
  id = if complete_id.include?('|')
         complete_id.split('|')[1]
       else
         complete_id.split('|')[0]
       end

  # remember every id a link was built for
  (@all_retrievable_ids ||= []).push(id)

  # several dbs... separate by ' '
  "/get_sequence/?id=#{id}&db=#{options[:databases].join(' ')}"
end
118
+ end
119
+ end
@@ -0,0 +1,12 @@
1
+ module SequenceServer
2
+ # We change Logging format so that it is consistent with Sinatra's
3
class SinatraLikeLogFormatter < Logger::Formatter
  # Layout of one log line: "[<timestamp>] <severity> <message>\n"
  MyFormat = "[%s] %s %s\n"

  def initialize
    # let Logger::Formatter set up its own state before customising it
    super
    self.datetime_format = "%Y-%m-%d %H:%M:%S"
  end

  # Called by Logger for every message. `progname` is accepted because
  # Logger passes it, but it is not part of the output.
  def call(severity, time, progname, msg)
    MyFormat % [format_datetime(time), severity, msg2str(msg)]
  end
end
12
+ end