bio-cnls_screenscraper 0.1.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - jruby-19mode # JRuby in 1.9 mode
6
+ - rbx-19mode
7
+ # - 1.8.7
8
+ # - jruby-18mode # JRuby in 1.8 mode
9
+ # - rbx-18mode
10
+
11
+ # uncomment this line if your project needs to run something other than `rake`:
12
+ # script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -7,8 +7,8 @@ source "http://rubygems.org"
7
7
  # Include everything needed to run rake, tests, features, etc.
8
8
  group :development do
9
9
  gem "shoulda", ">= 0"
10
- gem "bundler", "~> 1.0.0"
11
- gem "jeweler", "~> 1.5.2"
12
- gem "rcov", ">= 0"
13
- gem "bio", ">= 1.4.1"
10
+ gem "rdoc", "~> 3.12"
11
+ gem "jeweler", "~> 1.8.3"
12
+ gem "bundler", ">= 1.0.21"
13
+ gem "bio", ">= 1.4.2"
14
14
  end
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011 Ben J. Woodcroft
1
+ Copyright (c) 2012 Ben J Woodcroft
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -1,6 +1,18 @@
1
1
  = bio-cnls_screenscraper
2
2
 
3
- Description goes here.
3
+ bio-cnls_screenscraper is a programmatic biogem interface to http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_form.cgi - a server for prediction of importin α-dependent nuclear localization signals.
4
+
5
+ First, cache the results for each sequence in your amino acid FASTA file. This contacts the cNLS server once for each sequence, waiting 1 second between requests so as not to overload the server. Each result is saved as a separate HTML file, so it is best to run this command in an empty directory.
6
+
7
+ mkdir cNLS_cache
8
+ cd cNLS_cache
9
+ bio-nls_screenscraper.rb -h <fasta_file> 2>cNLS_caching.err
10
+
11
+ Then parse these HTML files and collate the results into a single tab-separated values file. It is best to write the results file outside the cache directory, because the parsing step attempts to parse every file in the current directory. The parsing uses the default cutoff of 8.0 for monopartite NLSs, and 7.0 for bipartite NLSs.
12
+
13
+ bio-nls_screenscraper.rb -cp >../cNLS_results.csv
14
+
15
+ Some sequences are unacceptable to the cNLS server - sequences that are too short (<19 aa), too long (the server requires <5000 aa), or that contain non-standard amino acids such as 'X'.
4
16
 
5
17
  == Contributing to bio-cnls_screenscraper
6
18
 
data/Rakefile CHANGED
@@ -33,13 +33,6 @@ Rake::TestTask.new(:test) do |test|
33
33
  test.verbose = true
34
34
  end
35
35
 
36
- require 'rcov/rcovtask'
37
- Rcov::RcovTask.new do |test|
38
- test.libs << 'test'
39
- test.pattern = 'test/**/test_*.rb'
40
- test.verbose = true
41
- end
42
-
43
36
  task :default => :test
44
37
 
45
38
  require 'rake/rdoctask'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.3.0
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'bio-cnls_screenscraper'
5
+
6
+ # Query sequences shorter than this number of amino acids fail
7
+ # (if shorter than 10 it tells you; if shorter than 19 it fails silently)
8
+ QUERY_LENGTH_MINIMUM = 19
9
+ ACCEPTABLE_AMINO_ACID_CHARACTERS = Bio::AminoAcid::Data::WEIGHT.keys.push('*')
10
+
11
+ options = {
12
+ :verbose => true,
13
+ :cache_html => false,
14
+ :use_cache => false,
15
+ :cutoff_score => nil,
16
+ :print_scores => false,
17
+ }
18
+ o = OptionParser.new do |opts|
19
+ opts.banner = ['',
20
+ 'Usage: bio-cnls_formatter.rb [-qhcsp] [fasta_filename]',
21
+ '\tfasta file can also be piped in on STDIN.',''
22
+ ].join("\n")
23
+
24
+ opts.on('-q','--quiet','Opposite of verbose. Default is not quiet (verbose is on)') do
25
+ options[:verbose] = false
26
+ end
27
+ opts.on('-h','--html','Cache HTML results in the current directory instead of parsing them. Default false.') do
28
+ options[:cache_html] = true
29
+ end
30
+ opts.on('-c','--cached','Parse the cache HTML results (as previously generated using -h/--html) in the current directory. Default false.') do
31
+ options[:use_cache] = true
32
+ end
33
+ opts.on('-s','--score SCORE','Cutoff score to be used when parsing results, between 0 and 10. Used when parsing results, not when querying the server') do |s|
34
+ options[:cutoff_score] = s.to_f
35
+ end
36
+ opts.on('-p','--print-scores','Output scores as well as true/false predictions. Default false.') do |s|
37
+ options[:print_scores] = true
38
+ end
39
+ end
40
+ o.parse!
41
+
42
+ print_result_headers = lambda do
43
+ to_print = [
44
+ 'Name',
45
+ 'Monopartite signal?',
46
+ 'Bipartite signal?'
47
+ ]
48
+ if options[:print_scores]
49
+ to_print.push 'Max monopartite score'
50
+ to_print.push 'Max bipartite score'
51
+ end
52
+
53
+ puts to_print.join("\t")
54
+ end
55
+
56
+ # Define a procedure for printing parsed results so it is more DRY
57
+ print_parsed_results = lambda do |sequence_name, cnls_result, score|
58
+ to_print = [
59
+ sequence_name,
60
+ cnls_result.monopartite_predicted?(score),
61
+ cnls_result.bipartite_predicted?(score)
62
+ ]
63
+ if options[:print_scores]
64
+ to_print.push cnls_result.max_monopartite_score
65
+ to_print.push cnls_result.max_bipartite_score
66
+ end
67
+
68
+ puts to_print.join("\t")
69
+ end
70
+
71
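+ # If -c/--cached was given, parse the cached HTML files in the current directory; otherwise read FASTA entries and query the server.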
72
+ if options[:use_cache]
73
+ print_result_headers.call
74
+ Dir.foreach('.') do |file|
75
+ next if File.directory?(file) #skip '.', '..' etc.
76
+
77
+ begin
78
+ res = Bio::CNLS::Screenscraper.parse_html_result(File.read(file))
79
+ print_parsed_results.call(
80
+ file, res, options[:cutoff_score]
81
+ )
82
+ rescue Exception => e
83
+ $stderr.puts "Failed to parse #{file}: #{e}"
84
+ end
85
+ end
86
+ else
87
+ Bio::FlatFile.foreach(ARGF) do |entry|
88
+ # Sequences are automatically disqualified if they contain characters that are neither amino acids nor stop codons
89
+ fails = entry.seq.gsub(/[#{ACCEPTABLE_AMINO_ACID_CHARACTERS.join('')}]/,'')
90
+ if fails.length > 0
91
+ if options[:verbose]
92
+ $stderr.puts "Found unacceptable characters in #{entry.definition}: #{fails}"
93
+ end
94
+ next
95
+
96
+ # Sequence length must be at least the minimum, excluding
97
+ # stop codons
98
+ elsif entry.seq.gsub(/\*/,'').length < QUERY_LENGTH_MINIMUM
99
+ if options[:verbose]
100
+ $stderr.puts "Query sequence too short (less than #{QUERY_LENGTH_MINIMUM} residues excluding stop codons): #{entry.definition}"
101
+ end
102
+ else
103
+ # This sequence passes, run the prediction on it
104
+ if options[:cache_html]
105
+ res = Bio::CNLS::Screenscraper.get_raw_html_result(entry.seq)
106
+ File.open("#{entry.definition}.html",'w') do |f|
107
+ f.puts res
108
+ end
109
+ $stderr.print '.' if options[:verbose]
110
+ else
111
+ res = Bio::CNLS::Screenscraper.submit(entry.seq)
112
+ print_result_headers.call
113
+ print_parsed_results.call(entry.definition, res, options[:cutoff_score])
114
+ end
115
+ end
116
+ end
117
+ end
@@ -1,293 +1,12 @@
1
- #!/usr/bin/env ruby
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio/cnls_screenscraper/cnls_screenscraper.rb'
2
12
 
3
- # A script to take a FASTA file, remove sequences that will fail, and automatically submit it to the cNLS server at http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_form.cgi
4
- # Unfortunately, the fasta upload seems to fail.
5
- # and format it so that it can be uploaded to the cNLS mapper (classical(?) nuclear localisation signal mapper).
6
- # The fasta output file can be uploaded to
7
- # http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_form.cgi
8
-
9
- require 'bio'
10
-
11
- module Bio
12
- class CNLS
13
- class Result
14
- attr_accessor :signals
15
-
16
- def initialize
17
- @signals = []
18
- end
19
-
20
- class NLS
21
- attr_accessor :position, :sequence, :score
22
-
23
- # sort by score descending
24
- def <=>(another)
25
- -(@score<=>another.score)
26
- end
27
- end
28
- class MonopartiteNLS<NLS; end
29
- class BipartiteNLS<NLS; end
30
-
31
- # Is this result a positive prediction or negative prediction?
32
- def predicted?
33
- !signals.nil? and !signals.empty?
34
- end
35
-
36
- def monopartite_predicted?(minimum_score=nil)
37
- @signals.each do |s|
38
- if s.kind_of?(MonopartiteNLS)
39
- return true if minimum_score.nil? #if no cutoff, return true
40
- return true if s.score >= minimum_score #otherwise apply the cutoff
41
- end
42
- end
43
- return false
44
- end
45
-
46
- def bipartite_predicted?(minimum_score=nil)
47
- @signals.each do |s|
48
- if s.kind_of?(BipartiteNLS)
49
- return true if minimum_score.nil? #if no cutoff, return true
50
- return true if s.score >= minimum_score #otherwise apply the cutoff
51
- end
52
- end
53
- return false
54
- end
55
-
56
- def max_monopartite_score
57
- max = 0.0
58
- @signals.each do |s|
59
- if s.kind_of?(MonopartiteNLS) and s.score > max
60
- max = s.score
61
- end
62
- end
63
- return max
64
- end
65
-
66
- def max_bipartite_score
67
- max = 0.0
68
- @signals.each do |s|
69
- if s.kind_of?(BipartiteNLS) and s.score > max
70
- max = s.score
71
- end
72
- end
73
- return max
74
- end
75
- end
76
-
77
- # A class used to automatically submit results to the cNLS webserver and parse the HTML results.
78
- class Screenscraper
79
- require 'uri'
80
- require 'net/http'
81
-
82
- ACCEPTABLE_CUTOFFS = %w(2.0 3.0 4.0 5.0 6.0)
83
-
84
- # Contact the cNLS prediction server and submit the amino acid sequence for prediction. Return a Bio::CNLS::Result object. Pause after each round for pause milliseconds, so as not to overload the server.
85
- def self.submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
86
- # contact webserver and sleep
87
- html = get_raw_html_result(amino_acid_sequence, cut_off, seconds_pause)
88
-
89
- # Return the parsed HTML as a CNLS::Result object
90
- return parse_html_result(html)
91
- end
92
-
93
- def self.get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
94
- unless ACCEPTABLE_CUTOFFS.include?(cut_off)
95
- raise Exception, "Specified cutoff `#{cut_off}' for the cNLS screenscraper is invalid. Valid cutoffs are #{ACCEPTABLE_CUTOFFS.join(', ')}. They are strings, not floating point values."
96
- end
97
-
98
- # retrieve the webpage
99
- res = Net::HTTP.post_form(URI.parse('http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_y.cgi'),
100
- {'cut_off' => cut_off, 'typedseq' => amino_acid_sequence})
101
-
102
- # if there is an error, raise it
103
- unless res.kind_of?(Net::HTTPOK)
104
- raise Exception, "Failed to retrieve cNLS, internet connectivity problem? Using cutoff/sequence #{cutoff}/#{amino_acid_sequence}"
105
- end
106
-
107
- # pause the specified number of seconds
108
- sleep seconds_pause
109
-
110
- return res.body
111
- end
112
-
113
- # Given HTML corresponding to a result, return a parse object that is more programmatically palatable.
114
- def self.parse_html_result(html)
115
- result = Result.new
116
-
117
- # The mono and bi-partite regular expressions are equivalent except for the Predicted X NLS bit at the beginning, thanksfully. However, they sometimes appear to be slightly different, which is rather odd.
118
- monopartite_regex = /Predicted monopartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
119
- bipartite_regex = /Predicted bipartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
120
-
121
- monopartite_no_hits = /Predicted monopartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
122
- bipartite_no_hits = /Predicted bipartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
123
- monopartite_no_hits2 = /Predicted monopartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
124
- bipartite_no_hits2 = /Predicted bipartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
125
-
126
- split_regex = /<\/code><\/big><\/strong><br.{0,2}><strong><big><code>/
127
-
128
- # Make sure the sequence isn't too long
129
- if html.match(/Query sequence should be < 5000 aa/)
130
- raise Exception, "Query sequence provided was too long (> 5000 aa)"
131
-
132
- # parse out monopartite signals
133
- elsif matches = html.match(monopartite_regex)
134
- positions = matches[1].split(split_regex)
135
- seqs = matches[2].split(split_regex)
136
- scores = matches[3].split(split_regex)
137
-
138
- positions.each_with_index do |pos, i|
139
- nls = Result::MonopartiteNLS.new
140
- nls.position = pos.to_i
141
- nls.sequence = seqs[i]
142
- nls.score = scores[i].to_f
143
- result.signals.push nls
144
- end
145
- elsif html.match(monopartite_no_hits) or html.match(monopartite_no_hits2)
146
- # do nothing, except for not raising a parsing exception
147
- else
148
- raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for monopartite signals, but the whole document is likely problematic.\n#{html}"
149
- end
150
-
151
-
152
- # parse out the bipartite signals
153
- if matches = html.match(bipartite_regex)
154
- positions = matches[1].split(split_regex)
155
- seqs = matches[2].split(split_regex)
156
- scores = matches[3].split(split_regex)
157
-
158
- positions.each_with_index do |pos, i|
159
- nls = Result::BipartiteNLS.new
160
- nls.position = pos.to_i
161
- nls.sequence = seqs[i]
162
- nls.score = scores[i].to_f
163
- result.signals.push nls
164
- end
165
- elsif html.match(bipartite_no_hits) or html.match(bipartite_no_hits2)
166
- # do nothing, except for not raising a parsing exception
167
- else
168
- raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for bipartite signals, monopartite signals seemed to be parsed OK.\n#{html}"
169
- end
170
-
171
- return result
172
- end
173
- end
174
- end
175
- end
176
-
177
-
178
-
179
- if __FILE__ == $0
180
- require 'optparse'
181
-
182
- # When entering sequences less than this number of amino acids as a query
183
- # it fails (if less than 10 it tells you, if less than 19 then it silently fails)
184
- QUERY_LENGTH_MINIMUM = 19
185
- ACCEPTABLE_AMINO_ACID_CHARACTERS = Bio::AminoAcid::Data::WEIGHT.keys.push('*')
186
-
187
- options = {
188
- :verbose => true,
189
- :cache_html => false,
190
- :use_cache => false,
191
- :cutoff_score => nil,
192
- :print_scores => false,
193
- }
194
- o = OptionParser.new do |opts|
195
- opts.banner = [
196
- 'Usage: bio-cnls_formatter.rb [-qh] [fasta_filename]',
197
- '\tfasta file can also be piped in on STDIN.'
198
- ]
199
- opts.on('-q','--quiet','Opposite of verbose. Default is not quiet (verbose is on)') do
200
- options[:verbose] = false
201
- end
202
- opts.on('-h','--html','Cache HTML results in the current directory instead of parsing them. Default false.') do
203
- options[:cache_html] = true
204
- end
205
- opts.on('-c','--cached','Parse the cache HTML results (as previously generated using -h/--html) in the current directory. Default false.') do
206
- options[:use_cache] = true
207
- end
208
- opts.on('-s','--score SCORE','Cutoff score to be used when parsing results, between 0 and 10. Used when parsing results, not when querying the server') do |s|
209
- options[:cutoff_score] = s.to_f
210
- end
211
- opts.on('-p','--print-scores','Output scores as well as true/false predictions. Default false.') do |s|
212
- options[:print_scores] = true
213
- end
214
- end
215
- o.parse!
216
-
217
- print_result_headers = lambda do
218
- to_print = [
219
- 'Name',
220
- 'Monopartite signal?',
221
- 'Bipartite signal?'
222
- ]
223
- if options[:print_scores]
224
- to_print.push 'Max monopartite score'
225
- to_print.push 'Max bipartite score'
226
- end
227
-
228
- puts to_print.join("\t")
229
- end
230
-
231
- # Define a procedure for printing parsed results so it is more DRY
232
- print_parsed_results = lambda do |sequence_name, cnls_result, score|
233
- to_print = [
234
- sequence_name,
235
- cnls_result.monopartite_predicted?(score),
236
- cnls_result.bipartite_predicted?(score)
237
- ]
238
- if options[:print_scores]
239
- to_print.push cnls_result.max_monopartite_score
240
- to_print.push cnls_result.max_bipartite_score
241
- end
242
-
243
- puts to_print.join("\t")
244
- end
245
-
246
- # If
247
- if options[:use_cache]
248
- print_result_headers.call
249
- Dir.foreach('.') do |file|
250
- next if File.directory?(file) #skip '.', '..' etc.
251
-
252
- begin
253
- res = Bio::CNLS::Screenscraper.parse_html_result(File.read(file))
254
- print_parsed_results.call(
255
- file, res, options[:cutoff_score]
256
- )
257
- rescue Exception => e
258
- $stderr.puts "Failed to parse #{file}: #{e}"
259
- end
260
- end
261
- else
262
- Bio::FlatFile.foreach(ARGF) do |entry|
263
- # Sequences are automatically disqualified if they contain characters that are neither amino acids or stop codons
264
- fails = entry.seq.gsub(/[#{ACCEPTABLE_AMINO_ACID_CHARACTERS.join('')}]/,'')
265
- if fails.length > 0
266
- if options[:verbose]
267
- $stderr.puts "Found unacceptable characters in #{entry.definition}: #{fails}"
268
- end
269
- next
270
-
271
- # Sequence length must be greater than the minimum, excluding
272
- # stop codons
273
- elsif entry.seq.gsub(/\*/,'').length < QUERY_LENGTH_MINIMUM
274
- if options[:verbose]
275
- $stderr.puts "Query sequence too short (less than #{QUERY_LENGTH_MINIMUM} residues excluding stop codons): #{entry.definition}"
276
- end
277
- else
278
- # This sequence passes, run the prediction on it
279
- if options[:cache_html]
280
- res = Bio::CNLS::Screenscraper.get_raw_html_result(entry.seq)
281
- File.open("#{entry.definition}.html",'w') do |f|
282
- f.puts res
283
- end
284
- $stderr.print '.' if options[:verbose]
285
- else
286
- res = Bio::CNLS::Screenscraper.submit(entry.seq)
287
- print_result_headers.call
288
- print_parsed_results.call(entry, res, options[:cutoff_score])
289
- end
290
- end
291
- end
292
- end
293
- end
@@ -0,0 +1,2 @@
1
+ # Purely to maintain backwards-compatibility.
2
+ require 'cnls_screenscraper'
@@ -0,0 +1,293 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # A script to take a FASTA file, remove sequences that will fail, and automatically submit it to the cNLS server at http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_form.cgi
4
+ # Unfortunately, the fasta upload seems to fail.
5
+ # and format it so that it can be uploaded to the cNLS mapper (classical(?) nuclear localisation signal mapper).
6
+ # The fasta output file can be uploaded to
7
+ # http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_form.cgi
8
+
9
+ require 'bio'
10
+
11
+ module Bio
12
+ class CNLS
13
+ class Result
14
+ attr_accessor :signals
15
+
16
+ def initialize
17
+ @signals = []
18
+ end
19
+
20
+ class NLS
21
+ attr_accessor :position, :sequence, :score
22
+
23
+ # sort by score descending
24
+ def <=>(another)
25
+ -(@score<=>another.score)
26
+ end
27
+ end
28
+ class MonopartiteNLS<NLS; end
29
+ class BipartiteNLS<NLS; end
30
+
31
+ # Is this result a positive prediction or negative prediction?
32
+ def predicted?
33
+ !signals.nil? and !signals.empty?
34
+ end
35
+
36
+ def monopartite_predicted?(minimum_score=nil)
37
+ @signals.each do |s|
38
+ if s.kind_of?(MonopartiteNLS)
39
+ return true if minimum_score.nil? #if no cutoff, return true
40
+ return true if s.score >= minimum_score #otherwise apply the cutoff
41
+ end
42
+ end
43
+ return false
44
+ end
45
+
46
+ def bipartite_predicted?(minimum_score=nil)
47
+ @signals.each do |s|
48
+ if s.kind_of?(BipartiteNLS)
49
+ return true if minimum_score.nil? #if no cutoff, return true
50
+ return true if s.score >= minimum_score #otherwise apply the cutoff
51
+ end
52
+ end
53
+ return false
54
+ end
55
+
56
+ def max_monopartite_score
57
+ max = 0.0
58
+ @signals.each do |s|
59
+ if s.kind_of?(MonopartiteNLS) and s.score > max
60
+ max = s.score
61
+ end
62
+ end
63
+ return max
64
+ end
65
+
66
+ def max_bipartite_score
67
+ max = 0.0
68
+ @signals.each do |s|
69
+ if s.kind_of?(BipartiteNLS) and s.score > max
70
+ max = s.score
71
+ end
72
+ end
73
+ return max
74
+ end
75
+ end
76
+
77
+ # A class used to automatically submit results to the cNLS webserver and parse the HTML results.
78
+ class Screenscraper
79
+ require 'uri'
80
+ require 'net/http'
81
+
82
+ ACCEPTABLE_CUTOFFS = %w(2.0 3.0 4.0 5.0 6.0)
83
+
84
+ # Contact the cNLS prediction server and submit the amino acid sequence for prediction. Return a Bio::CNLS::Result object. Pause for seconds_pause seconds after each request, so as not to overload the server.
85
+ def self.submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
86
+ # contact webserver and sleep
87
+ html = get_raw_html_result(amino_acid_sequence, cut_off, seconds_pause)
88
+
89
+ # Return the parsed HTML as a CNLS::Result object
90
+ return parse_html_result(html)
91
+ end
92
+
93
+ def self.get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
94
+ unless ACCEPTABLE_CUTOFFS.include?(cut_off)
95
+ raise Exception, "Specified cutoff `#{cut_off}' for the cNLS screenscraper is invalid. Valid cutoffs are #{ACCEPTABLE_CUTOFFS.join(', ')}. They are strings, not floating point values."
96
+ end
97
+
98
+ # retrieve the webpage
99
+ res = Net::HTTP.post_form(URI.parse('http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_y.cgi'),
100
+ {'cut_off' => cut_off, 'typedseq' => amino_acid_sequence})
101
+
102
+ # if there is an error, raise it
103
+ unless res.kind_of?(Net::HTTPOK)
104
+ raise Exception, "Failed to retrieve cNLS, internet connectivity problem? Using cutoff/sequence #{cutoff}/#{amino_acid_sequence}"
105
+ end
106
+
107
+ # pause the specified number of seconds
108
+ sleep seconds_pause
109
+
110
+ return res.body
111
+ end
112
+
113
+ # Given HTML corresponding to a result, return a parse object that is more programmatically palatable.
114
+ def self.parse_html_result(html)
115
+ result = Result.new
116
+
117
+ # The mono and bi-partite regular expressions are equivalent except for the Predicted X NLS bit at the beginning, thankfully. However, they sometimes appear to be slightly different, which is rather odd.
118
+ monopartite_regex = /Predicted monopartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
119
+ bipartite_regex = /Predicted bipartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
120
+
121
+ monopartite_no_hits = /Predicted monopartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
122
+ bipartite_no_hits = /Predicted bipartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
123
+ monopartite_no_hits2 = /Predicted monopartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
124
+ bipartite_no_hits2 = /Predicted bipartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
125
+
126
+ split_regex = /<\/code><\/big><\/strong><br.{0,2}><strong><big><code>/
127
+
128
+ # Make sure the sequence isn't too long
129
+ if html.match(/Query sequence should be < 5000 aa/)
130
+ raise Exception, "Query sequence provided was too long (> 5000 aa)"
131
+
132
+ # parse out monopartite signals
133
+ elsif matches = html.match(monopartite_regex)
134
+ positions = matches[1].split(split_regex)
135
+ seqs = matches[2].split(split_regex)
136
+ scores = matches[3].split(split_regex)
137
+
138
+ positions.each_with_index do |pos, i|
139
+ nls = Result::MonopartiteNLS.new
140
+ nls.position = pos.to_i
141
+ nls.sequence = seqs[i]
142
+ nls.score = scores[i].to_f
143
+ result.signals.push nls
144
+ end
145
+ elsif html.match(monopartite_no_hits) or html.match(monopartite_no_hits2)
146
+ # do nothing, except for not raising a parsing exception
147
+ else
148
+ raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for monopartite signals, but the whole document is likely problematic.\n#{html}"
149
+ end
150
+
151
+
152
+ # parse out the bipartite signals
153
+ if matches = html.match(bipartite_regex)
154
+ positions = matches[1].split(split_regex)
155
+ seqs = matches[2].split(split_regex)
156
+ scores = matches[3].split(split_regex)
157
+
158
+ positions.each_with_index do |pos, i|
159
+ nls = Result::BipartiteNLS.new
160
+ nls.position = pos.to_i
161
+ nls.sequence = seqs[i]
162
+ nls.score = scores[i].to_f
163
+ result.signals.push nls
164
+ end
165
+ elsif html.match(bipartite_no_hits) or html.match(bipartite_no_hits2)
166
+ # do nothing, except for not raising a parsing exception
167
+ else
168
+ raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for bipartite signals, monopartite signals seemed to be parsed OK.\n#{html}"
169
+ end
170
+
171
+ return result
172
+ end
173
+ end
174
+ end
175
+ end
176
+
177
+
178
+
179
+ if __FILE__ == $0
180
+ require 'optparse'
181
+
182
+ # When entering sequences less than this number of amino acids as a query
183
+ # it fails (if less than 10 it tells you, if less than 19 then it silently fails)
184
+ QUERY_LENGTH_MINIMUM = 19
185
+ ACCEPTABLE_AMINO_ACID_CHARACTERS = Bio::AminoAcid::Data::WEIGHT.keys.push('*')
186
+
187
+ options = {
188
+ :verbose => true,
189
+ :cache_html => false,
190
+ :use_cache => false,
191
+ :cutoff_score => nil,
192
+ :print_scores => false,
193
+ }
194
+ o = OptionParser.new do |opts|
195
+ opts.banner = [
196
+ 'Usage: bio-cnls_formatter.rb [-qhcsp] [fasta_filename]',
197
+ '\tfasta file can also be piped in on STDIN.'
198
+ ]
199
+ opts.on('-q','--quiet','Opposite of verbose. Default is not quiet (verbose is on)') do
200
+ options[:verbose] = false
201
+ end
202
+ opts.on('-h','--html','Cache HTML results in the current directory instead of parsing them. Default false.') do
203
+ options[:cache_html] = true
204
+ end
205
+ opts.on('-c','--cached','Parse the cache HTML results (as previously generated using -h/--html) in the current directory. Default false.') do
206
+ options[:use_cache] = true
207
+ end
208
+ opts.on('-s','--score SCORE','Cutoff score to be used when parsing results, between 0 and 10. Used when parsing results, not when querying the server') do |s|
209
+ options[:cutoff_score] = s.to_f
210
+ end
211
+ opts.on('-p','--print-scores','Output scores as well as true/false predictions. Default false.') do |s|
212
+ options[:print_scores] = true
213
+ end
214
+ end
215
+ o.parse!
216
+
217
+ print_result_headers = lambda do
218
+ to_print = [
219
+ 'Name',
220
+ 'Monopartite signal?',
221
+ 'Bipartite signal?'
222
+ ]
223
+ if options[:print_scores]
224
+ to_print.push 'Max monopartite score'
225
+ to_print.push 'Max bipartite score'
226
+ end
227
+
228
+ puts to_print.join("\t")
229
+ end
230
+
231
+ # Define a procedure for printing parsed results so it is more DRY
232
+ print_parsed_results = lambda do |sequence_name, cnls_result, score|
233
+ to_print = [
234
+ sequence_name,
235
+ cnls_result.monopartite_predicted?(score),
236
+ cnls_result.bipartite_predicted?(score)
237
+ ]
238
+ if options[:print_scores]
239
+ to_print.push cnls_result.max_monopartite_score
240
+ to_print.push cnls_result.max_bipartite_score
241
+ end
242
+
243
+ puts to_print.join("\t")
244
+ end
245
+
246
+ # If
247
+ if options[:use_cache]
248
+ print_result_headers.call
249
+ Dir.foreach('.') do |file|
250
+ next if File.directory?(file) #skip '.', '..' etc.
251
+
252
+ begin
253
+ res = Bio::CNLS::Screenscraper.parse_html_result(File.read(file))
254
+ print_parsed_results.call(
255
+ file, res, options[:cutoff_score]
256
+ )
257
+ rescue Exception => e
258
+ $stderr.puts "Failed to parse #{file}: #{e}"
259
+ end
260
+ end
261
+ else
262
+ Bio::FlatFile.foreach(ARGF) do |entry|
263
+ # Sequences are automatically disqualified if they contain characters that are neither amino acids nor stop codons
264
+ fails = entry.seq.gsub(/[#{ACCEPTABLE_AMINO_ACID_CHARACTERS.join('')}]/,'')
265
+ if fails.length > 0
266
+ if options[:verbose]
267
+ $stderr.puts "Found unacceptable characters in #{entry.definition}: #{fails}"
268
+ end
269
+ next
270
+
271
+ # Sequence length must be at least the minimum, excluding
272
+ # stop codons
273
+ elsif entry.seq.gsub(/\*/,'').length < QUERY_LENGTH_MINIMUM
274
+ if options[:verbose]
275
+ $stderr.puts "Query sequence too short (less than #{QUERY_LENGTH_MINIMUM} residues excluding stop codons): #{entry.definition}"
276
+ end
277
+ else
278
+ # This sequence passes, run the prediction on it
279
+ if options[:cache_html]
280
+ res = Bio::CNLS::Screenscraper.get_raw_html_result(entry.seq)
281
+ File.open("#{entry.definition}.html",'w') do |f|
282
+ f.puts res
283
+ end
284
+ $stderr.print '.' if options[:verbose]
285
+ else
286
+ res = Bio::CNLS::Screenscraper.submit(entry.seq)
287
+ print_result_headers.call
288
+ print_parsed_results.call(entry, res, options[:cutoff_score])
289
+ end
290
+ end
291
+ end
292
+ end
293
+ end
@@ -1,8 +1,7 @@
1
1
  require 'helper'
2
- require 'bio-cnls_screenscraper'
3
2
 
4
3
  class TestBioCnlsScreenscraper < Test::Unit::TestCase
5
- @@data_dir = File.join(File.dirname(__FILE__),['data'])
4
+ @@data_dir = File.join(File.dirname(__FILE__),'..',['data'])
6
5
 
7
6
  should "correctly parse hit results with no hits" do
8
7
  html = File.open(File.join(@@data_dir,'nohits.html')).read
metadata CHANGED
@@ -1,157 +1,151 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: bio-cnls_screenscraper
3
- version: !ruby/object:Gem::Version
4
- hash: 27
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 1
9
- - 0
10
- version: 0.1.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ prerelease:
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Ben J. Woodcroft
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-02-21 00:00:00 +11:00
19
- default_executable:
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- prerelease: false
12
+ date: 2012-05-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
23
15
  name: shoulda
24
- type: :development
25
- version_requirements: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
26
17
  none: false
27
- requirements:
28
- - - ">="
29
- - !ruby/object:Gem::Version
30
- hash: 3
31
- segments:
32
- - 0
33
- version: "0"
34
- requirement: *id001
35
- - !ruby/object:Gem::Dependency
36
- prerelease: false
37
- name: bundler
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
38
22
  type: :development
39
- version_requirements: &id002 !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
40
25
  none: false
41
- requirements:
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rdoc
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
42
35
  - - ~>
43
- - !ruby/object:Gem::Version
44
- hash: 23
45
- segments:
46
- - 1
47
- - 0
48
- - 0
49
- version: 1.0.0
50
- requirement: *id002
51
- - !ruby/object:Gem::Dependency
36
+ - !ruby/object:Gem::Version
37
+ version: '3.12'
38
+ type: :development
52
39
  prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '3.12'
46
+ - !ruby/object:Gem::Dependency
53
47
  name: jeweler
54
- type: :development
55
- version_requirements: &id003 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
56
49
  none: false
57
- requirements:
50
+ requirements:
58
51
  - - ~>
59
- - !ruby/object:Gem::Version
60
- hash: 7
61
- segments:
62
- - 1
63
- - 5
64
- - 2
65
- version: 1.5.2
66
- requirement: *id003
67
- - !ruby/object:Gem::Dependency
68
- prerelease: false
69
- name: rcov
52
+ - !ruby/object:Gem::Version
53
+ version: 1.8.3
70
54
  type: :development
71
- version_requirements: &id004 !ruby/object:Gem::Requirement
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.8.3
62
+ - !ruby/object:Gem::Dependency
63
+ name: bundler
64
+ requirement: !ruby/object:Gem::Requirement
72
65
  none: false
73
- requirements:
74
- - - ">="
75
- - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
- version: "0"
80
- requirement: *id004
81
- - !ruby/object:Gem::Dependency
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: 1.0.21
70
+ type: :development
82
71
  prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: 1.0.21
78
+ - !ruby/object:Gem::Dependency
83
79
  name: bio
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: 1.4.2
84
86
  type: :development
85
- version_requirements: &id005 !ruby/object:Gem::Requirement
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
86
89
  none: false
87
- requirements:
88
- - - ">="
89
- - !ruby/object:Gem::Version
90
- hash: 5
91
- segments:
92
- - 1
93
- - 4
94
- - 1
95
- version: 1.4.1
96
- requirement: *id005
97
- description: Programmatic interface to the cNLS nuclear localisation signal prediction software
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: 1.4.2
94
+ description: Programmatic interface to the cNLS nuclear localisation signal prediction
95
+ software
98
96
  email: donttrustben@gmail.com
99
- executables: []
100
-
97
+ executables:
98
+ - bio-cnls_screenscraper
101
99
  extensions: []
102
-
103
- extra_rdoc_files:
100
+ extra_rdoc_files:
104
101
  - LICENSE.txt
105
102
  - README.rdoc
106
- files:
103
+ files:
107
104
  - .document
105
+ - .travis.yml
108
106
  - Gemfile
109
107
  - LICENSE.txt
110
108
  - README.rdoc
111
109
  - Rakefile
112
110
  - VERSION
111
+ - bin/bio-cnls_screenscraper
113
112
  - lib/bio-cnls_screenscraper.rb
113
+ - lib/bio/cnls_screenscraper.rb
114
+ - lib/bio/cnls_screenscraper/cnls_screenscraper.rb
115
+ - test/bio/test_cnls_screenscraper.rb
114
116
  - test/data/badCharacters.html
115
117
  - test/data/bipartiteHitOnly.html
116
118
  - test/data/lessThan10Fail.html
117
119
  - test/data/monopartiteHitOnly.html
118
120
  - test/data/nohits.html
119
121
  - test/helper.rb
120
- - test/test_bio-cnls_screenscraper.rb
121
- has_rdoc: true
122
122
  homepage: http://github.com/wwood/bioruby-cnls_screenscraper
123
- licenses:
123
+ licenses:
124
124
  - MIT
125
125
  post_install_message:
126
126
  rdoc_options: []
127
-
128
- require_paths:
127
+ require_paths:
129
128
  - lib
130
- required_ruby_version: !ruby/object:Gem::Requirement
129
+ required_ruby_version: !ruby/object:Gem::Requirement
131
130
  none: false
132
- requirements:
133
- - - ">="
134
- - !ruby/object:Gem::Version
135
- hash: 3
136
- segments:
131
+ requirements:
132
+ - - ! '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ segments:
137
136
  - 0
138
- version: "0"
139
- required_rubygems_version: !ruby/object:Gem::Requirement
137
+ hash: 108166141
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
139
  none: false
141
- requirements:
142
- - - ">="
143
- - !ruby/object:Gem::Version
144
- hash: 3
145
- segments:
146
- - 0
147
- version: "0"
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
148
144
  requirements: []
149
-
150
145
  rubyforge_project:
151
- rubygems_version: 1.3.7
146
+ rubygems_version: 1.8.21
152
147
  signing_key:
153
148
  specification_version: 3
154
- summary: Programmatic interface to the cNLS nuclear localisation signal prediction software
155
- test_files:
156
- - test/helper.rb
157
- - test/test_bio-cnls_screenscraper.rb
149
+ summary: Programmatic interface to the cNLS nuclear localisation signal prediction
150
+ software
151
+ test_files: []