sequenceserver 0.6.7

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sequenceserver might be problematic. Click here for more details.

@@ -0,0 +1,127 @@
1
+ require 'sequenceserver/database'
2
+
3
require 'shellwords'

module SequenceServer
  module Helpers
    # Helpers that talk to the operating system: locating the BLAST+
    # executables and discovering formatted BLAST databases.
    module SystemHelpers
      # Scan the given directory for blast executables. Passing `nil` scans the
      # system `PATH`.
      # ---
      # Arguments:
      # * bin(String) - absolute path to the directory containing blast binaries
      # ---
      # Returns:
      # * a hash of blast methods, and their corresponding absolute path
      #   (the bare command name when `bin` is `nil`)
      # ---
      # Raises:
      # * IOError - if `bin` is given but is not a directory, or if any of
      #             the executables can't be found
      #
      # > scan_blast_executables('/home/yeban/bin')
      # => { "blastx"=>"/home/yeban/bin/blastx",
      #      "blastn"=>"/home/yeban/bin/blastn",
      #      ...
      #    }
      def scan_blast_executables(bin)
        if bin && !File.directory?(bin)
          raise IOError, "Could not find '#{bin}' defined in config.yml."
        end

        binaries = {}
        %w|blastn blastp blastx tblastn tblastx blastdbcmd makeblastdb blast_formatter|.each do |method|
          # Without a bin directory we fall back to the bare command name and
          # let `command?` look it up on the PATH.
          path = bin ? File.join(bin, method) : method

          unless command?(path)
            blasturl = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download'
            raise IOError, "Could not find blast binaries. You may need to " \
              "install BLAST+ from #{blasturl}. And/or point config.yml to blast's " \
              "bin directory."
          end

          binaries[method] = path
        end

        #LOG.info("Config bin dir: #{bin}")
        binaries
      end

      # Scan the given directory (including subdirectory) for blast databases.
      # ---
      # Arguments:
      # * db_root(String)    - absolute path to the blast databases
      # * blastdbcmd(String) - the `blastdbcmd` executable to run
      #                        (defaults to the one found on the PATH)
      # ---
      # Returns:
      # * a hash of sorted blast databases grouped by database type:
      #   protein, or nucleotide
      # ---
      # Raises:
      # * IOError - if `db_root` is not a directory, if no database can be
      #             found (and the user declines to format them), or if
      #             blastdbcmd reports a "BLAST Database error"
      #
      # > scan_blast_db('/home/yeban/blast_db')
      # => { "protein" => [], "nucleotide" => [] }
      def scan_blast_db(db_root, blastdbcmd = 'blastdbcmd')
        raise IOError, "Database directory doesn't exist: #{db_root}" unless File.directory?(db_root)

        # Both the executable and the directory are shell-escaped so that
        # paths containing spaces or shell metacharacters work as intended.
        find_dbs_command = %|#{Shellwords.escape(blastdbcmd.to_s)} -recursive -list #{Shellwords.escape(db_root.to_s)} -list_outfmt "%p %f %t" 2>&1|

        begin
          db_list = %x|#{find_dbs_command}|
          if db_list.empty?
            raise IOError, "No formatted blast databases found in '#{ db_root }'."
          end
        rescue => e
          puts '', e.to_s

          print "Do you want to format your blast databases now? [Y/n]: "
          # Read from $stdin explicitly: Kernel#gets would read from files
          # named in ARGV when the program was started with arguments.
          answer = $stdin.gets

          # No answer at all (EOF, e.g. a non-interactive run) or an explicit
          # "n": let the caller decide what to do if database discovery fails.
          raise if answer.nil? || answer.chomp[0, 1].downcase == 'n'

          database_formatter = File.join(settings.root, 'database_formatter.rb')
          # Argument-list form of `system` bypasses the shell entirely.
          system(database_formatter, db_root.to_s)
          retry
        end

        if db_list.match(/BLAST Database error/)
          raise IOError, "Error parsing blast databases.\n" + "Tried: '#{find_dbs_command}'\n"+
            "It crashed with the following error: '#{db_list}'\n" +
            "Try reformatting databases using makeblastdb.\n"
        end

        db = {}

        db_list.each_line do |line|
          # `each_line` keeps the trailing newline, so a blank line is "\n",
          # not "". Strip before testing (required for BLAST+ 2.2.22).
          next if line.strip.empty?

          type, name, *title = line.split(' ')
          type  = type.downcase
          name  = name.freeze
          title = title.join(' ').freeze
          #LOG.info("Found #{type} database: #{title} at #{name}")
          (db[type] ||= []) << Database.new(name, title)
        end

        # the erb would fail as calling nil.each_with_index if a dbtype was undefined.
        db['protein']    ||= []
        db['nucleotide'] ||= []

        # sort the list of dbs
        db['protein'].sort!
        db['nucleotide'].sort!

        db
      end

      private

      # check if the given command exists and is executable
      # returns true if all is good, false otherwise.
      #
      # A command containing a path separator is checked as a file path; a
      # bare command name is looked up in every directory of the `PATH`
      # environment variable. This is done in Ruby rather than by shelling
      # out to `which`, so unusual characters in the path cannot be
      # misinterpreted by a shell.
      def command?(command)
        command = command.to_s
        return false if command.empty?

        candidates =
          if command.include?(File::SEPARATOR)
            [command]
          else
            (ENV['PATH'] || '').split(File::PATH_SEPARATOR).map do |dir|
              # an empty PATH entry conventionally means the current directory
              File.join(dir.empty? ? '.' : dir, command)
            end
          end

        candidates.any? { |path| File.file?(path) && File.executable?(path) }
      end
    end

    # Hook called by Ruby when `Helpers` is included into `klass`; extends
    # `klass` with `SystemHelpers`, making those helpers class-level methods.
    def self.included(klass)
      klass.extend SystemHelpers
    end
  end
end
@@ -0,0 +1,119 @@
1
+ module SequenceServer
2
+ # Module to collect some sequence-related helper functions
3
+ module SequenceHelpers
4
+
5
+ # copied from bioruby's Bio::Sequence
6
+ # returns a Hash. Eg: composition("asdfasdfffffasdf")
7
+ # => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
8
+ def composition(sequence_string)
9
+ count = Hash.new(0)
10
+ sequence_string.scan(/./) do |x|
11
+ count[x] += 1
12
+ end
13
+ return count
14
+ end
15
+
16
+ # Strips all non-letter characters. guestimates sequence based on that.
17
+ # If less than 10 useable characters... returns nil
18
+ # If more than 90% ACGTU returns :nucleotide. else returns :protein
19
+ def guess_sequence_type(sequence_string)
20
+ cleaned_sequence = sequence_string.gsub(/[^A-Z]/i, '') # removing non-letter characters
21
+ cleaned_sequence.gsub!(/[NX]/i, '') # removing ambiguous characters
22
+
23
+ return nil if cleaned_sequence.length < 10 # conservative
24
+
25
+ composition = composition(cleaned_sequence)
26
+ composition_NAs = composition.select { |character, count|character.match(/[ACGTU]/i) } # only putative NAs
27
+ putative_NA_counts = composition_NAs.collect { |key_value_array| key_value_array[1] } # only count, not char
28
+ putative_NA_sum = putative_NA_counts.inject { |sum, n| sum + n } # count of all putative NA
29
+ putative_NA_sum = 0 if putative_NA_sum.nil?
30
+
31
+ if putative_NA_sum > (0.9 * cleaned_sequence.length)
32
+ return :nucleotide
33
+ else
34
+ return :protein
35
+ end
36
+ end
37
+
38
+ # splits input at putative fasta definition lines (like ">adsfadsf"), guesses sequence type for each sequence.
39
+ # if not enough sequence to determine, returns nil.
40
+ # if 2 kinds of sequence mixed together, raises ArgumentError
41
+ # otherwise, returns :nucleotide or :protein
42
+ def type_of_sequences(fasta_format_string)
43
+ # the first sequence does not need to have a fasta definition line
44
+ sequences = fasta_format_string.split(/^>.*$/).delete_if { |seq| seq.empty? }
45
+
46
+ # get all sequence types
47
+ sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }.uniq.compact
48
+
49
+ return nil if sequence_types.empty?
50
+
51
+ if sequence_types.length == 1
52
+ return sequence_types.first # there is only one (but yes its an array)
53
+ else
54
+ raise ArgumentError, "Insufficient info to determine sequence type. Cleaned queries are: #{ sequences.to_s }"
55
+ end
56
+ end
57
+
58
+ # Return the database type that can be used for a given blast method.
59
+ # db_type_for("blastp") => :protein
60
+ # db_type_for("tblastn") => :nucleotide
61
+ # db_type_for(nil) => nil
62
+ def db_type_for(blast_method)
63
+ case blast_method
64
+ when 'blastp', 'blastx'
65
+ :protein
66
+ when 'blastn', 'tblastx', 'tblastn'
67
+ :nucleotide
68
+ end
69
+ end
70
+
71
+ # Return the blast methods that can be used for a given type of sequence.
72
+ # blast_methods_for(:protein) => ['blastp', 'tblastn']
73
+ # blast_methods_for(:nucleotide) => ['blastn','tblastx','blastx']
74
+ # blast_methods_for(nil) => ['blastp', 'tblastn','blastn','tblastx','blastx']
75
+ def blast_methods_for(seq_type)
76
+ case seq_type
77
+ when :protein
78
+ ['blastp', 'tblastn']
79
+ when :nucleotide
80
+ ['blastn','tblastx','blastx']
81
+ else # Sequence type not predicted, so don't make any assumptions about the blast method
82
+ ['blastp', 'tblastn','blastn','tblastx','blastx']
83
+ end
84
+ end
85
+
86
+ def sequence_from_blastdb(ids, db) # helpful when displaying parsed blast results
87
+ # we know how to handle an Array of ids
88
+ ids = ids.join(',') if ids.is_a? Array
89
+
90
+ # we don't know what to do if the arguments ain't String
91
+ raise TypeError unless ids.is_a? String and db.is_a? String
92
+
93
+ # query now!
94
+ #
95
+ # If `blastdbcmd` throws error, we assume sequence not found.
96
+ blastdbcmd = settings.binaries['blastdbcmd']
97
+ command = %x|#{blastdbcmd} -db #{db} -entry #{ids} 2> /dev/null|
98
+ end
99
+
100
+ # Given a sequence_id and databases, apply the default (standard)
101
+ # way to convert a sequence_id into a hyperlink, so that the
102
+ # blast results include hyperlinks.
103
+ def construct_standard_sequence_hyperlink(options)
104
+ if options[:sequence_id].match(/^[^ ]/) #if there is a space right after the '>', makeblastdb was run without -parse_seqids
105
+ # By default, add a link to a fasta file of the sequence (if makeblastdb was called with -parse_seqids)
106
+ complete_id = options[:sequence_id][/^(\S+)\s*.*/, 1] # get id part
107
+ id = complete_id.include?('|') ? complete_id.split('|')[1] : complete_id.split('|')[0]
108
+ @all_retrievable_ids ||= []
109
+ @all_retrievable_ids.push(id)
110
+
111
+ link = "/get_sequence/?id=#{id}&db=#{options[:databases].join(' ')}" # several dbs... separate by ' '
112
+ return link
113
+ else
114
+ # do nothing - link == nil means no link will be incorporated
115
+ return nil
116
+ end
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,12 @@
1
require 'logger'

module SequenceServer
  # We change Logging format so that it is consistent with Sinatra's
  #
  # Each entry is rendered as "[<timestamp>] <severity> <message>" followed
  # by a newline; the program name is not included.
  class SinatraLikeLogFormatter < Logger::Formatter
    # Layout of one log line: timestamp, severity, message.
    MyFormat = "[%s] %s %s\n"

    # Set up the formatter with a "YYYY-MM-DD HH:MM:SS" timestamp format.
    def initialize
      super # let Logger::Formatter set up its own state first
      self.datetime_format = "%Y-%m-%d %H:%M:%S"
    end

    # Build the log line for one message. This is the method Logger invokes
    # on its formatter.
    # ---
    # Arguments:
    # * severity - the severity label, e.g. "INFO"
    # * time     - the Time the message was logged
    # * progname - the program name (unused by this format)
    # * msg      - the message; converted to a String by Logger::Formatter
    # ---
    # Returns:
    # * the formatted log line as a String
    def call(severity, time, progname, msg)
      MyFormat % [format_datetime(time), severity, msg2str(msg)]
    end
  end
end