unipept 0.4.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eadba0bac0ca2f5d8dbbbceffe168f26c621cd86
4
- data.tar.gz: 5115330bf9a71bfabfa9ac78c852769c9299a51d
3
+ metadata.gz: 6f20b5baa54b98b05f96b2ed54cdcc4bc6772021
4
+ data.tar.gz: 0fdb2fc4944c5bfbe8ad76395531b5414d054f22
5
5
  SHA512:
6
- metadata.gz: c03d75460bbb446335ce708e3602b5edb327434f4e427deb6097231be446e9ecb004bf0f31bcc7034026e685690eb900128ba5b7a6148cc269ea6407397d2ce2
7
- data.tar.gz: 8d83ec62c72179ec629777c54dee5f8b41db8ecedcdc3ddd9b5ec82b419c7e99592bc5ab54cd83b85f0a307d8156c7f5a85f4d12168b4036b200dd7aaadbdc75
6
+ metadata.gz: 182a1babb95902e0cb74d08494748940d02fb77f0c502086eec19809ed2c4b2b8af5a9f190041efb8ac7e30614230971bd5e2e53aec6770587888543b128eafc
7
+ data.tar.gz: fde9797bcdf7be09258b8010a39dd91b686732c2bb647b1488f51bac96a9c66efddf1061ab1919b35c90f3eb76334f67b84e8a707c26a7dcc630c3e1020053a0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.2
1
+ 0.5.0
data/bin/peptfilter CHANGED
@@ -16,6 +16,11 @@ root_cmd = Cri::Command.new_basic_root.modify do
16
16
  lacks = opts.fetch(:lacks, "").chars.to_a
17
17
  contains = opts.fetch(:contains, "").chars.to_a
18
18
  $stdin.each_line do |pept|
19
+ # FASTA headers
20
+ if pept.start_with? '>'
21
+ puts pept
22
+ next
23
+ end
19
24
  pept = pept.chomp
20
25
  length_ok = pept.length >= minlen && pept.length <= maxlen
21
26
  lacks_ok = (pept.chars.to_a & lacks).size == 0
data/bin/prot2pept CHANGED
@@ -8,8 +8,29 @@ root_cmd = Cri::Command.new_basic_root.modify do
8
8
  required :p, :pattern, "cleavage pattern to split input protein (default: ([KR])([^P]))"
9
9
  run do |opts, args, cmd|
10
10
  pattern = opts.fetch(:pattern, "([KR])([^P])")
11
- $stdin.each_line do |prot|
12
- puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
11
+ # decide if we have FASTA input
12
+ fasta_header = gets
13
+ if fasta_header.start_with? '>'
14
+ # fasta input, need to join lines
15
+ while !$stdin.eof?
16
+ prot = ""
17
+ # Sometimes you just got to accept this weird and ugly code
18
+ until $stdin.eof? || (line = gets).start_with?('>')
19
+ prot += line.chomp
20
+ end
21
+ puts fasta_header
22
+ puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
23
+
24
+ fasta_header = line
25
+ end
26
+ else
27
+ # handle our already read line
28
+ puts fasta_header.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
29
+
30
+ # we no longer have to join lines as input is now more sane
31
+ $stdin.each_line do |prot|
32
+ puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
33
+ end
13
34
  end
14
35
  end
15
36
  end
data/bin/unipept CHANGED
@@ -10,213 +10,51 @@ require_relative '../lib/unipept'
10
10
 
11
11
  Signal.trap("PIPE", "EXIT")
12
12
  Signal.trap("INT", "EXIT")
13
- class ApiRunner < Cri::CommandRunner
14
-
15
- def initialize(args, opts, cmd)
16
- super
17
- @configuration = Unipept::Configuration.new
18
- host = @configuration['host']
19
- if host.nil? || host.empty?
20
- puts "WARNING: no host has been set, you can set the host with `unipept config host http://localhost:3000/`"
21
- exit 1
22
- end
23
- if !host.start_with? "http://"
24
- host = "http://#{host}"
25
- end
26
13
 
27
- @url = "#{host}/api/v1/#{mapping[cmd.name]}.json"
28
- @message_url = "#{host}/api/v1/messages.json"
29
- end
14
+ module Unipept
15
+ class Taxa2lca < ApiRunner
30
16
 
31
- def mapping
32
- {'pept2taxa' => 'pept2taxa', 'pept2lca' => 'pept2lca'}
33
- end
34
-
35
- def input_iterator
36
- if options[:input]
37
- File.readlines(options[:input]).each
38
- else
39
- STDIN.each_line
17
+ def mapping
18
+ {"taxa2lca" => "taxa2lca"}
40
19
  end
41
- end
42
20
 
43
- def batch_size
44
- 100
45
- end
21
+ def peptide_iterator(peptides, &block)
22
+ block.call(peptides.to_a, 0)
23
+ end
46
24
 
47
- def url_options(sub_part)
48
- filter = options[:select] ? options[:select] : []
49
- if filter.empty?
50
- names = true
51
- else
52
- names = filter.any? {|f| /.*name.*/.match f}
25
+ def batch_size
26
+ raise "NOT NEEDED FOR TAXA2LCA"
53
27
  end
54
- {:input => sub_part,
55
- :equate_il => options[:equate],
56
- :extra => options[:extra],
57
- :names => names,
58
- }
28
+
59
29
  end
60
30
 
61
- def get_server_message
62
- return if options[:quiet]
63
- return unless STDOUT.tty?
64
- last_fetched = @configuration['last_fetch_date']
65
- if last_fetched.nil? || (last_fetched + 60 * 60 * 24) < Time.now
66
- version = File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
67
- puts Typhoeus.get(@message_url, params: {version: version}).body
31
+ class Pept2prot < ApiRunner
68
32
 
69
- @configuration['last_fetch_date'] = Time.now
70
- @configuration.save
33
+ def mapping
34
+ {"pept2prot" => "pept2prot"}
71
35
  end
72
- end
73
36
 
74
- def run
75
- get_server_message
76
-
77
- formatter = Unipept::Formatter.new_for_format(options[:format])
78
- peptides = input_iterator
79
-
80
- filter_list = options[:select] ? options[:select] : []
81
- filter_list = filter_list.map {|f| glob_to_regex(f) }
82
- output = STDOUT.tty? ? STDOUT : STDERR
83
-
84
- batch_order = Unipept::BatchOrder.new
85
-
86
- printed_header = false
87
- result = []
88
-
89
- hydra = Typhoeus::Hydra.new(max_concurrency: 20)
90
- num_req = 0
91
-
92
- peptide_iterator(peptides) do |sub_division, i|
93
- request = Typhoeus::Request.new(
94
- @url,
95
- method: :post,
96
- body: url_options(sub_division),
97
- accept_encoding: "gzip"
98
- )
99
- request.on_complete do |resp|
100
- if resp.timed_out?
101
- $stderr.puts "request timed out, continuing anyway, but results might be incomplete"
102
- else
103
- if resp.success?
104
- # if JSON parsing goes wrong
105
- sub_result = JSON[resp.response_body] rescue []
106
- sub_result = [sub_result] if not sub_result.kind_of? Array
107
-
108
- sub_result.map! {|r| r.select! {|k,v| filter_list.any? {|f| f.match k } } } if ! filter_list.empty?
109
-
110
- if options[:xml]
111
- result << sub_result
112
- end
113
-
114
- # wait till it's our turn to write
115
- batch_order.wait(i) do
116
- if ! sub_result.empty?
117
- if ! printed_header
118
- write_to_output formatter.header(sub_result)
119
- printed_header = true
120
- end
121
- write_to_output formatter.format(sub_result)
122
- end
123
- end
124
- else
125
- path = File.expand_path(File.join(Dir.home, "unipept.log"))
126
- File.open(path, "w") do |f|
127
- f.write resp.response_body
128
- end
129
- $stderr.puts "API request failed! log can be found in #{path}"
37
+ def download_xml(result)
38
+ if options[:xml]
39
+ FileUtils.mkdir_p(options[:xml])
40
+ result.first.each do |prot|
41
+ File.open(options[:xml] + "/#{prot['uniprot_id']}.xml", "wb") do |f|
42
+ f.write Typhoeus.get("http://www.uniprot.org/uniprot/#{prot['uniprot_id']}.xml").response_body
130
43
  end
131
44
  end
132
45
  end
133
- hydra.queue request
134
-
135
- num_req += 1
136
- if num_req % 200 == 0
137
- hydra.run
138
- end
139
-
140
46
  end
141
47
 
142
- hydra.run
143
-
144
- begin
145
- download_xml(result)
146
- rescue
147
- STDERR.puts "Something went wrong while downloading xml information! please check the output"
48
+ def batch_size
49
+ 10
148
50
  end
149
-
150
- end
151
-
152
- def write_to_output(string)
153
- if options[:output]
154
- File.open(options[:output], 'a') do |f|
155
- f.write string
156
- end
157
- else
158
- puts string
159
- end
160
- end
161
-
162
-
163
- def download_xml(result)
164
- if options[:xml]
165
- File.open(options[:xml] + ".xml", "wb") do |f|
166
- f.write Typhoeus.get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=#{result.first.map{|h| h['taxon_id'] }.join(",")}&retmode=xml").response_body
167
- end
168
- end
169
- end
170
-
171
- def peptide_iterator(peptides, &block)
172
- peptides.each_slice(batch_size).with_index(&block)
173
- end
174
-
175
- private
176
-
177
- def glob_to_regex(glob_string)
178
- # only implement * -> . for now
179
- Regexp.new glob_string.gsub("*", ".*")
180
- end
181
- end
182
-
183
- class Taxa2lca < ApiRunner
184
-
185
- def mapping
186
- {"taxa2lca" => "taxa2lca"}
187
- end
188
-
189
- def peptide_iterator(peptides, &block)
190
- block.call(peptides.to_a, 0)
191
- end
192
-
193
- def batch_size
194
- raise "NOT NEEDED FOR TAXA2LCA"
195
51
  end
196
52
 
197
- end
198
-
199
- class Pept2prot < ApiRunner
200
-
201
- def mapping
202
- {"pept2prot" => "pept2prot"}
203
- end
204
-
205
- def download_xml(result)
206
- if options[:xml]
207
- FileUtils.mkdir_p(options[:xml])
208
- result.first.each do |prot|
209
- File.open(options[:xml] + "/#{prot['uniprot_id']}.xml", "wb") do |f|
210
- f.write Typhoeus.get("http://www.uniprot.org/uniprot/#{prot['uniprot_id']}.xml").response_body
211
- end
212
- end
53
+ class Taxonomy < ApiRunner
54
+ def mapping
55
+ {"taxonomy" => "taxonomy"}
213
56
  end
214
57
  end
215
-
216
- def batch_size
217
- 10
218
- end
219
-
220
58
  end
221
59
 
222
60
  root_cmd = Cri::Command.new_basic_root.modify do
@@ -227,6 +65,9 @@ root_cmd = Cri::Command.new_basic_root.modify do
227
65
  option :o, :output, "output file", :argument => :required
228
66
  option :f, :format, "output format (available: #{Unipept::Formatter.available.join "," }) (default: #{Unipept::Formatter.default})", :argument => :required
229
67
 
68
+ # Configuration options
69
+ option nil, "config-host", "Override host setting", argument: :required
70
+
230
71
  run do |opts, args, cmd|
231
72
  if opts[:version]
232
73
  puts File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
@@ -260,7 +101,7 @@ root_cmd.define_command('pept2taxa') do
260
101
  option :a, :extra, "Show full lineage"
261
102
  option :x, :xml, "Download taxonomy from NCBI as xml (specify output filename)", :argument => :required
262
103
 
263
- runner ApiRunner
104
+ runner Unipept::ApiRunner
264
105
  end
265
106
 
266
107
  root_cmd.define_command('pept2lca') do
@@ -273,7 +114,7 @@ root_cmd.define_command('pept2lca') do
273
114
  option :s, :select, "select the attributes", :argument => :required, :multiple => true
274
115
  option :a, :extra, "Show full lineage"
275
116
 
276
- runner ApiRunner
117
+ runner Unipept::ApiRunner
277
118
  end
278
119
 
279
120
  root_cmd.define_command('taxa2lca') do
@@ -285,7 +126,7 @@ root_cmd.define_command('taxa2lca') do
285
126
  option :s, :select, "select the attributes", :argument => :required, :multiple => true
286
127
  option :a, :extra, "Show full lineage"
287
128
 
288
- runner Taxa2lca
129
+ runner Unipept::Taxa2lca
289
130
  end
290
131
 
291
132
  root_cmd.define_command('pept2prot') do
@@ -299,7 +140,19 @@ root_cmd.define_command('pept2prot') do
299
140
  option :x, :xml, "download uniprot record in specified directory", :argument => :required
300
141
  flag :a, :extra, "include all information. WARNING: will take much longer!"
301
142
 
302
- runner Pept2prot
143
+ runner Unipept::Pept2prot
144
+ end
145
+
146
+ root_cmd.define_command('taxonomy') do
147
+ usage 'taxonomy [options]'
148
+ aliases :tax
149
+ summary 'Give NCBI taxonomy information on given input taxon ids'
150
+ description 'Returns information for each input taxon id'
151
+
152
+ option :s, :select, "select the attributes", :argument => :required, :multiple => true
153
+ flag :a, :extra, "include all information. WARNING: will take much longer!"
154
+
155
+ runner Unipept::Taxonomy
303
156
  end
304
157
 
305
158
  root_cmd.run(ARGV)
@@ -0,0 +1,217 @@
1
+ module Unipept
2
+ class ApiRunner < Cri::CommandRunner
3
+
4
+ def initialize(args, opts, cmd)
5
+ super
6
+ @configuration = Unipept::Configuration.new
7
+
8
+ set_configuration
9
+
10
+ @url = "#{@host}/api/v1/#{mapping[cmd.name]}.json"
11
+ @message_url = "#{@host}/api/v1/messages.json"
12
+ end
13
+
14
+ def set_configuration
15
+ # find host in opts first
16
+ if options[:'config-host']
17
+ host = options[:'config-host']
18
+ else
19
+ host = @configuration['host']
20
+ end
21
+
22
+ # No host has been set?
23
+ if host.nil? || host.empty?
24
+ puts "WARNING: no host has been set, you can set the host with `unipept config host http://localhost:3000/`"
25
+ exit 1
26
+ end
27
+ if !host.start_with? "http://"
28
+ host = "http://#{host}"
29
+ end
30
+
31
+ @host = host
32
+ end
33
+
34
+ def mapping
35
+ {'pept2taxa' => 'pept2taxa', 'pept2lca' => 'pept2lca'}
36
+ end
37
+
38
+ def input_iterator
39
+ # Argument over file input over stdin
40
+ if !arguments.empty?
41
+ arguments.each
42
+ else
43
+ if options[:input]
44
+ IO.foreach(options[:input])
45
+ else
46
+ STDIN.each_line
47
+ end
48
+ end
49
+ end
50
+
51
+ def batch_size
52
+ 100
53
+ end
54
+
55
+ def url_options(sub_part)
56
+ filter = options[:select] ? options[:select] : []
57
+ if filter.empty?
58
+ names = true
59
+ else
60
+ names = filter.any? {|f| /.*name.*/.match f}
61
+ end
62
+ {:input => sub_part,
63
+ :equate_il => options[:equate],
64
+ :extra => options[:extra],
65
+ :names => names,
66
+ }
67
+ end
68
+
69
+ def get_server_message
70
+ return if options[:quiet]
71
+ return unless STDOUT.tty?
72
+ last_fetched = @configuration['last_fetch_date']
73
+ if last_fetched.nil? || (last_fetched + 60 * 60 * 24) < Time.now
74
+ version = File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
75
+ puts Typhoeus.get(@message_url, params: {version: version}).body
76
+
77
+ @configuration['last_fetch_date'] = Time.now
78
+ @configuration.save
79
+ end
80
+ end
81
+
82
+ def run
83
+ get_server_message
84
+
85
+ formatter = Unipept::Formatter.new_for_format(options[:format])
86
+ peptides = input_iterator
87
+
88
+ filter_list = options[:select] ? options[:select] : []
89
+ filter_list = filter_list.map {|f| glob_to_regex(f) }
90
+ output = STDOUT.tty? ? STDOUT : STDERR
91
+
92
+ batch_order = Unipept::BatchOrder.new
93
+
94
+ printed_header = false
95
+ result = []
96
+
97
+ hydra = Typhoeus::Hydra.new(max_concurrency: 10)
98
+ num_req = 0
99
+
100
+ peptide_iterator(peptides) do |sub_division, i, fasta_mapper|
101
+ request = Typhoeus::Request.new(
102
+ @url,
103
+ method: :post,
104
+ body: url_options(sub_division),
105
+ accept_encoding: "gzip"
106
+ )
107
+ request.on_complete do |resp|
108
+ if resp.timed_out?
109
+ $stderr.puts "request timed out, continuing anyway, but results might be incomplete"
110
+ else
111
+ if resp.success?
112
+ # if JSON parsing goes wrong
113
+ sub_result = JSON[resp.response_body] rescue []
114
+ sub_result = [sub_result] if not sub_result.kind_of? Array
115
+
116
+ sub_result.map! {|r| r.select! {|k,v| filter_list.any? {|f| f.match k } } } if ! filter_list.empty?
117
+
118
+ if options[:xml]
119
+ result << sub_result
120
+ end
121
+
122
+ # wait till it's our turn to write
123
+ batch_order.wait(i) do
124
+ if ! sub_result.empty?
125
+ if ! printed_header
126
+ write_to_output formatter.header(sub_result, fasta_mapper)
127
+ printed_header = true
128
+ end
129
+ write_to_output formatter.format(sub_result, fasta_mapper)
130
+ end
131
+ end
132
+ else
133
+ save_error(resp.response_body)
134
+ end
135
+ end
136
+ end
137
+ hydra.queue request
138
+
139
+ num_req += 1
140
+ if num_req % 200 == 0
141
+ hydra.run
142
+ end
143
+
144
+ end
145
+
146
+ hydra.run
147
+
148
+ begin
149
+ download_xml(result)
150
+ rescue
151
+ STDERR.puts "Something went wrong while downloading xml information! please check the output"
152
+ end
153
+
154
+ end
155
+
156
+ def save_error(message)
157
+ path = File.expand_path(File.join(Dir.home, ".unipept", "unipept-#{Time.now.strftime("%F-%T")}.log"))
158
+ FileUtils.mkdir_p File.dirname(path)
159
+ File.open(path, "w") do |f|
160
+ f.write message
161
+ end
162
+ $stderr.puts "API request failed! log can be found in #{path}"
163
+ end
164
+
165
+ def write_to_output(string)
166
+ if options[:output]
167
+ File.open(options[:output], 'a') do |f|
168
+ f.write string
169
+ end
170
+ else
171
+ puts string
172
+ end
173
+ end
174
+
175
+
176
+ def download_xml(result)
177
+ if options[:xml]
178
+ File.open(options[:xml] + ".xml", "wb") do |f|
179
+ f.write Typhoeus.get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=#{result.first.map{|h| h['taxon_id'] }.join(",")}&retmode=xml").response_body
180
+ end
181
+ end
182
+ end
183
+
184
+ def peptide_iterator(peptides, &block)
185
+ first = peptides.first
186
+ if first.start_with? '>'
187
+ # FASTA MODE ENGAGED
188
+ fasta_header = first
189
+ peptides.each_slice(batch_size).with_index do |sub,i|
190
+ fasta_mapper = {}
191
+ sub.map! {|s| s.chomp}
192
+ j = 0
193
+ while j < sub.size
194
+ if sub[j].start_with? '>'
195
+ fasta_header = sub[j]
196
+ else
197
+ fasta_mapper[sub[j]] = fasta_header
198
+ end
199
+ j += 1
200
+ end
201
+ sub -= fasta_mapper.values.uniq
202
+ block.call(sub, i, fasta_mapper)
203
+ end
204
+
205
+ else
206
+ peptides.each_slice(batch_size).with_index(&block)
207
+ end
208
+ end
209
+
210
+ private
211
+
212
+ def glob_to_regex(glob_string)
213
+ # only implement * -> . for now
214
+ Regexp.new glob_string.gsub("*", ".*")
215
+ end
216
+ end
217
+ end
@@ -30,7 +30,7 @@ module Unipept
30
30
  end
31
31
 
32
32
  # JSON formatted data goes in, something other comes out
33
- def format(data)
33
+ def format(data, fasta_mapper = nil)
34
34
  data
35
35
  end
36
36
  end
@@ -49,23 +49,40 @@ module Unipept
49
49
 
50
50
  register :csv
51
51
 
52
- def header(data)
52
+ def header(data, fasta_mapper = nil)
53
53
  CSV.generate do |csv|
54
54
  first = data.first
55
55
  if first.kind_of? Array
56
56
  first = first.first
57
57
  end
58
- csv << first.keys.map(&:to_s) if first
58
+ if fasta_mapper
59
+ csv << (['fasta_header'] + first.keys).map(&:to_s) if first
60
+ else
61
+ csv << first.keys.map(&:to_s) if first
62
+ end
63
+
59
64
  end
60
65
  end
61
66
 
62
- def format(data)
67
+ def format(data, fasta_mapper = nil)
63
68
  CSV.generate do |csv|
64
69
  data.each do |o|
65
70
  if o.kind_of? Array
66
- o.each {|h| csv << h.values.map { |v| v == "" ? nil : v }}
71
+ o.each do |h|
72
+ if fasta_mapper
73
+ extra_key = [fasta_mapper[h.values.first]]
74
+ csv << (extra_key + h.values).map { |v| v == "" ? nil : v }
75
+ else
76
+ csv << h.values.map { |v| v == "" ? nil : v }
77
+ end
78
+ end
67
79
  else
68
- csv << o.values.map { |v| v == "" ? nil : v }
80
+ if fasta_mapper
81
+ extra_key = [fasta_mapper[o.values.first]]
82
+ csv << (extra_key + o.values).map { |v| v == "" ? nil : v }
83
+ else
84
+ csv << o.values.map { |v| v == "" ? nil : v }
85
+ end
69
86
  end
70
87
  end
71
88
  end
data/lib/unipept.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require_relative 'unipept/formatters'
2
2
  require_relative 'unipept/configuration'
3
3
  require_relative 'unipept/batch_order'
4
+ require_relative 'unipept/api_runner'
4
5
 
5
6
  module Unipept
6
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unipept
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Toon Willems
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-14 00:00:00.000000000 Z
11
+ date: 2014-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: shoulda
@@ -131,6 +131,7 @@ files:
131
131
  - bin/unipept
132
132
  - bin/uniprot
133
133
  - lib/unipept.rb
134
+ - lib/unipept/api_runner.rb
134
135
  - lib/unipept/batch_order.rb
135
136
  - lib/unipept/configuration.rb
136
137
  - lib/unipept/formatters.rb