unipept 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eadba0bac0ca2f5d8dbbbceffe168f26c621cd86
4
- data.tar.gz: 5115330bf9a71bfabfa9ac78c852769c9299a51d
3
+ metadata.gz: 6f20b5baa54b98b05f96b2ed54cdcc4bc6772021
4
+ data.tar.gz: 0fdb2fc4944c5bfbe8ad76395531b5414d054f22
5
5
  SHA512:
6
- metadata.gz: c03d75460bbb446335ce708e3602b5edb327434f4e427deb6097231be446e9ecb004bf0f31bcc7034026e685690eb900128ba5b7a6148cc269ea6407397d2ce2
7
- data.tar.gz: 8d83ec62c72179ec629777c54dee5f8b41db8ecedcdc3ddd9b5ec82b419c7e99592bc5ab54cd83b85f0a307d8156c7f5a85f4d12168b4036b200dd7aaadbdc75
6
+ metadata.gz: 182a1babb95902e0cb74d08494748940d02fb77f0c502086eec19809ed2c4b2b8af5a9f190041efb8ac7e30614230971bd5e2e53aec6770587888543b128eafc
7
+ data.tar.gz: fde9797bcdf7be09258b8010a39dd91b686732c2bb647b1488f51bac96a9c66efddf1061ab1919b35c90f3eb76334f67b84e8a707c26a7dcc630c3e1020053a0
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.2
1
+ 0.5.0
data/bin/peptfilter CHANGED
@@ -16,6 +16,11 @@ root_cmd = Cri::Command.new_basic_root.modify do
16
16
  lacks = opts.fetch(:lacks, "").chars.to_a
17
17
  contains = opts.fetch(:contains, "").chars.to_a
18
18
  $stdin.each_line do |pept|
19
+ # FASTA headers
20
+ if pept.start_with? '>'
21
+ puts pept
22
+ next
23
+ end
19
24
  pept = pept.chomp
20
25
  length_ok = pept.length >= minlen && pept.length <= maxlen
21
26
  lacks_ok = (pept.chars.to_a & lacks).size == 0
data/bin/prot2pept CHANGED
@@ -8,8 +8,29 @@ root_cmd = Cri::Command.new_basic_root.modify do
8
8
  required :p, :pattern, "cleavage pattern to split input protein (default: ([KR])([^P]))"
9
9
  run do |opts, args, cmd|
10
10
  pattern = opts.fetch(:pattern, "([KR])([^P])")
11
- $stdin.each_line do |prot|
12
- puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
11
+ # decide if we have FASTA input
12
+ fasta_header = gets
13
+ if fasta_header.start_with? '>'
14
+ # fasta input, need to join lines
15
+ while !$stdin.eof?
16
+ prot = ""
17
+ # Sometimes you just got to accept this weird and ugly code
18
+ until $stdin.eof? || (line = gets).start_with?('>')
19
+ prot += line.chomp
20
+ end
21
+ puts fasta_header
22
+ puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
23
+
24
+ fasta_header = line
25
+ end
26
+ else
27
+ # handle our already read line
28
+ puts fasta_header.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
29
+
30
+ # we no longer have to join lines as input is now more sane
31
+ $stdin.each_line do |prot|
32
+ puts prot.gsub(/#{pattern}/,"\\1\n\\2").gsub(/#{pattern}/, "\\1\n\\2").split("\n").reject(&:empty?)
33
+ end
13
34
  end
14
35
  end
15
36
  end
data/bin/unipept CHANGED
@@ -10,213 +10,51 @@ require_relative '../lib/unipept'
10
10
 
11
11
  Signal.trap("PIPE", "EXIT")
12
12
  Signal.trap("INT", "EXIT")
13
- class ApiRunner < Cri::CommandRunner
14
-
15
- def initialize(args, opts, cmd)
16
- super
17
- @configuration = Unipept::Configuration.new
18
- host = @configuration['host']
19
- if host.nil? || host.empty?
20
- puts "WARNING: no host has been set, you can set the host with `unipept config host http://localhost:3000/`"
21
- exit 1
22
- end
23
- if !host.start_with? "http://"
24
- host = "http://#{host}"
25
- end
26
13
 
27
- @url = "#{host}/api/v1/#{mapping[cmd.name]}.json"
28
- @message_url = "#{host}/api/v1/messages.json"
29
- end
14
+ module Unipept
15
+ class Taxa2lca < ApiRunner
30
16
 
31
- def mapping
32
- {'pept2taxa' => 'pept2taxa', 'pept2lca' => 'pept2lca'}
33
- end
34
-
35
- def input_iterator
36
- if options[:input]
37
- File.readlines(options[:input]).each
38
- else
39
- STDIN.each_line
17
+ def mapping
18
+ {"taxa2lca" => "taxa2lca"}
40
19
  end
41
- end
42
20
 
43
- def batch_size
44
- 100
45
- end
21
+ def peptide_iterator(peptides, &block)
22
+ block.call(peptides.to_a, 0)
23
+ end
46
24
 
47
- def url_options(sub_part)
48
- filter = options[:select] ? options[:select] : []
49
- if filter.empty?
50
- names = true
51
- else
52
- names = filter.any? {|f| /.*name.*/.match f}
25
+ def batch_size
26
+ raise "NOT NEEDED FOR TAXA2LCA"
53
27
  end
54
- {:input => sub_part,
55
- :equate_il => options[:equate],
56
- :extra => options[:extra],
57
- :names => names,
58
- }
28
+
59
29
  end
60
30
 
61
- def get_server_message
62
- return if options[:quiet]
63
- return unless STDOUT.tty?
64
- last_fetched = @configuration['last_fetch_date']
65
- if last_fetched.nil? || (last_fetched + 60 * 60 * 24) < Time.now
66
- version = File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
67
- puts Typhoeus.get(@message_url, params: {version: version}).body
31
+ class Pept2prot < ApiRunner
68
32
 
69
- @configuration['last_fetch_date'] = Time.now
70
- @configuration.save
33
+ def mapping
34
+ {"pept2prot" => "pept2prot"}
71
35
  end
72
- end
73
36
 
74
- def run
75
- get_server_message
76
-
77
- formatter = Unipept::Formatter.new_for_format(options[:format])
78
- peptides = input_iterator
79
-
80
- filter_list = options[:select] ? options[:select] : []
81
- filter_list = filter_list.map {|f| glob_to_regex(f) }
82
- output = STDOUT.tty? ? STDOUT : STDERR
83
-
84
- batch_order = Unipept::BatchOrder.new
85
-
86
- printed_header = false
87
- result = []
88
-
89
- hydra = Typhoeus::Hydra.new(max_concurrency: 20)
90
- num_req = 0
91
-
92
- peptide_iterator(peptides) do |sub_division, i|
93
- request = Typhoeus::Request.new(
94
- @url,
95
- method: :post,
96
- body: url_options(sub_division),
97
- accept_encoding: "gzip"
98
- )
99
- request.on_complete do |resp|
100
- if resp.timed_out?
101
- $stderr.puts "request timed out, continuing anyway, but results might be incomplete"
102
- else
103
- if resp.success?
104
- # if JSON parsing goes wrong
105
- sub_result = JSON[resp.response_body] rescue []
106
- sub_result = [sub_result] if not sub_result.kind_of? Array
107
-
108
- sub_result.map! {|r| r.select! {|k,v| filter_list.any? {|f| f.match k } } } if ! filter_list.empty?
109
-
110
- if options[:xml]
111
- result << sub_result
112
- end
113
-
114
- # wait till it's our turn to write
115
- batch_order.wait(i) do
116
- if ! sub_result.empty?
117
- if ! printed_header
118
- write_to_output formatter.header(sub_result)
119
- printed_header = true
120
- end
121
- write_to_output formatter.format(sub_result)
122
- end
123
- end
124
- else
125
- path = File.expand_path(File.join(Dir.home, "unipept.log"))
126
- File.open(path, "w") do |f|
127
- f.write resp.response_body
128
- end
129
- $stderr.puts "API request failed! log can be found in #{path}"
37
+ def download_xml(result)
38
+ if options[:xml]
39
+ FileUtils.mkdir_p(options[:xml])
40
+ result.first.each do |prot|
41
+ File.open(options[:xml] + "/#{prot['uniprot_id']}.xml", "wb") do |f|
42
+ f.write Typhoeus.get("http://www.uniprot.org/uniprot/#{prot['uniprot_id']}.xml").response_body
130
43
  end
131
44
  end
132
45
  end
133
- hydra.queue request
134
-
135
- num_req += 1
136
- if num_req % 200 == 0
137
- hydra.run
138
- end
139
-
140
46
  end
141
47
 
142
- hydra.run
143
-
144
- begin
145
- download_xml(result)
146
- rescue
147
- STDERR.puts "Something went wrong while downloading xml information! please check the output"
48
+ def batch_size
49
+ 10
148
50
  end
149
-
150
- end
151
-
152
- def write_to_output(string)
153
- if options[:output]
154
- File.open(options[:output], 'a') do |f|
155
- f.write string
156
- end
157
- else
158
- puts string
159
- end
160
- end
161
-
162
-
163
- def download_xml(result)
164
- if options[:xml]
165
- File.open(options[:xml] + ".xml", "wb") do |f|
166
- f.write Typhoeus.get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=#{result.first.map{|h| h['taxon_id'] }.join(",")}&retmode=xml").response_body
167
- end
168
- end
169
- end
170
-
171
- def peptide_iterator(peptides, &block)
172
- peptides.each_slice(batch_size).with_index(&block)
173
- end
174
-
175
- private
176
-
177
- def glob_to_regex(glob_string)
178
- # only implement * -> . for now
179
- Regexp.new glob_string.gsub("*", ".*")
180
- end
181
- end
182
-
183
- class Taxa2lca < ApiRunner
184
-
185
- def mapping
186
- {"taxa2lca" => "taxa2lca"}
187
- end
188
-
189
- def peptide_iterator(peptides, &block)
190
- block.call(peptides.to_a, 0)
191
- end
192
-
193
- def batch_size
194
- raise "NOT NEEDED FOR TAXA2LCA"
195
51
  end
196
52
 
197
- end
198
-
199
- class Pept2prot < ApiRunner
200
-
201
- def mapping
202
- {"pept2prot" => "pept2prot"}
203
- end
204
-
205
- def download_xml(result)
206
- if options[:xml]
207
- FileUtils.mkdir_p(options[:xml])
208
- result.first.each do |prot|
209
- File.open(options[:xml] + "/#{prot['uniprot_id']}.xml", "wb") do |f|
210
- f.write Typhoeus.get("http://www.uniprot.org/uniprot/#{prot['uniprot_id']}.xml").response_body
211
- end
212
- end
53
+ class Taxonomy < ApiRunner
54
+ def mapping
55
+ {"taxonomy" => "taxonomy"}
213
56
  end
214
57
  end
215
-
216
- def batch_size
217
- 10
218
- end
219
-
220
58
  end
221
59
 
222
60
  root_cmd = Cri::Command.new_basic_root.modify do
@@ -227,6 +65,9 @@ root_cmd = Cri::Command.new_basic_root.modify do
227
65
  option :o, :output, "output file", :argument => :required
228
66
  option :f, :format, "output format (available: #{Unipept::Formatter.available.join "," }) (default: #{Unipept::Formatter.default})", :argument => :required
229
67
 
68
+ # Configuration options
69
+ option nil, "config-host", "Override host setting", argument: :required
70
+
230
71
  run do |opts, args, cmd|
231
72
  if opts[:version]
232
73
  puts File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
@@ -260,7 +101,7 @@ root_cmd.define_command('pept2taxa') do
260
101
  option :a, :extra, "Show full lineage"
261
102
  option :x, :xml, "Download taxonomy from NCBI as xml (specify output filename)", :argument => :required
262
103
 
263
- runner ApiRunner
104
+ runner Unipept::ApiRunner
264
105
  end
265
106
 
266
107
  root_cmd.define_command('pept2lca') do
@@ -273,7 +114,7 @@ root_cmd.define_command('pept2lca') do
273
114
  option :s, :select, "select the attributes", :argument => :required, :multiple => true
274
115
  option :a, :extra, "Show full lineage"
275
116
 
276
- runner ApiRunner
117
+ runner Unipept::ApiRunner
277
118
  end
278
119
 
279
120
  root_cmd.define_command('taxa2lca') do
@@ -285,7 +126,7 @@ root_cmd.define_command('taxa2lca') do
285
126
  option :s, :select, "select the attributes", :argument => :required, :multiple => true
286
127
  option :a, :extra, "Show full lineage"
287
128
 
288
- runner Taxa2lca
129
+ runner Unipept::Taxa2lca
289
130
  end
290
131
 
291
132
  root_cmd.define_command('pept2prot') do
@@ -299,7 +140,19 @@ root_cmd.define_command('pept2prot') do
299
140
  option :x, :xml, "download uniprot record in specified directory", :argument => :required
300
141
  flag :a, :extra, "include all information. WARNING: will take much longer!"
301
142
 
302
- runner Pept2prot
143
+ runner Unipept::Pept2prot
144
+ end
145
+
146
+ root_cmd.define_command('taxonomy') do
147
+ usage 'taxonomy [options]'
148
+ aliases :tax
149
+ summary 'Give NCBI taxonomy information on given input taxon ids'
150
+ description 'Returns information for each input taxon id'
151
+
152
+ option :s, :select, "select the attributes", :argument => :required, :multiple => true
153
+ flag :a, :extra, "include all information. WARNING: will take much longer!"
154
+
155
+ runner Unipept::Taxonomy
303
156
  end
304
157
 
305
158
  root_cmd.run(ARGV)
@@ -0,0 +1,217 @@
1
+ module Unipept
2
+ class ApiRunner < Cri::CommandRunner
3
+
4
+ def initialize(args, opts, cmd)
5
+ super
6
+ @configuration = Unipept::Configuration.new
7
+
8
+ set_configuration
9
+
10
+ @url = "#{@host}/api/v1/#{mapping[cmd.name]}.json"
11
+ @message_url = "#{@host}/api/v1/messages.json"
12
+ end
13
+
14
+ def set_configuration
15
+ # find host in opts first
16
+ if options[:'config-host']
17
+ host = options[:'config-host']
18
+ else
19
+ host = @configuration['host']
20
+ end
21
+
22
+ # No host has been set?
23
+ if host.nil? || host.empty?
24
+ puts "WARNING: no host has been set, you can set the host with `unipept config host http://localhost:3000/`"
25
+ exit 1
26
+ end
27
+ if !host.start_with? "http://"
28
+ host = "http://#{host}"
29
+ end
30
+
31
+ @host = host
32
+ end
33
+
34
+ def mapping
35
+ {'pept2taxa' => 'pept2taxa', 'pept2lca' => 'pept2lca'}
36
+ end
37
+
38
+ def input_iterator
39
+ # Argument over file input over stdin
40
+ if !arguments.empty?
41
+ arguments.each
42
+ else
43
+ if options[:input]
44
+ IO.foreach(options[:input])
45
+ else
46
+ STDIN.each_line
47
+ end
48
+ end
49
+ end
50
+
51
+ def batch_size
52
+ 100
53
+ end
54
+
55
+ def url_options(sub_part)
56
+ filter = options[:select] ? options[:select] : []
57
+ if filter.empty?
58
+ names = true
59
+ else
60
+ names = filter.any? {|f| /.*name.*/.match f}
61
+ end
62
+ {:input => sub_part,
63
+ :equate_il => options[:equate],
64
+ :extra => options[:extra],
65
+ :names => names,
66
+ }
67
+ end
68
+
69
+ def get_server_message
70
+ return if options[:quiet]
71
+ return unless STDOUT.tty?
72
+ last_fetched = @configuration['last_fetch_date']
73
+ if last_fetched.nil? || (last_fetched + 60 * 60 * 24) < Time.now
74
+ version = File.read(File.join(File.dirname(__FILE__), "..", "VERSION"))
75
+ puts Typhoeus.get(@message_url, params: {version: version}).body
76
+
77
+ @configuration['last_fetch_date'] = Time.now
78
+ @configuration.save
79
+ end
80
+ end
81
+
82
+ def run
83
+ get_server_message
84
+
85
+ formatter = Unipept::Formatter.new_for_format(options[:format])
86
+ peptides = input_iterator
87
+
88
+ filter_list = options[:select] ? options[:select] : []
89
+ filter_list = filter_list.map {|f| glob_to_regex(f) }
90
+ output = STDOUT.tty? ? STDOUT : STDERR
91
+
92
+ batch_order = Unipept::BatchOrder.new
93
+
94
+ printed_header = false
95
+ result = []
96
+
97
+ hydra = Typhoeus::Hydra.new(max_concurrency: 10)
98
+ num_req = 0
99
+
100
+ peptide_iterator(peptides) do |sub_division, i, fasta_mapper|
101
+ request = Typhoeus::Request.new(
102
+ @url,
103
+ method: :post,
104
+ body: url_options(sub_division),
105
+ accept_encoding: "gzip"
106
+ )
107
+ request.on_complete do |resp|
108
+ if resp.timed_out?
109
+ $stderr.puts "request timed out, continuing anyway, but results might be incomplete"
110
+ else
111
+ if resp.success?
112
+ # if JSON parsing goes wrong
113
+ sub_result = JSON[resp.response_body] rescue []
114
+ sub_result = [sub_result] if not sub_result.kind_of? Array
115
+
116
+ sub_result.map! {|r| r.select! {|k,v| filter_list.any? {|f| f.match k } } } if ! filter_list.empty?
117
+
118
+ if options[:xml]
119
+ result << sub_result
120
+ end
121
+
122
+ # wait till it's our turn to write
123
+ batch_order.wait(i) do
124
+ if ! sub_result.empty?
125
+ if ! printed_header
126
+ write_to_output formatter.header(sub_result, fasta_mapper)
127
+ printed_header = true
128
+ end
129
+ write_to_output formatter.format(sub_result, fasta_mapper)
130
+ end
131
+ end
132
+ else
133
+ save_error(resp.response_body)
134
+ end
135
+ end
136
+ end
137
+ hydra.queue request
138
+
139
+ num_req += 1
140
+ if num_req % 200 == 0
141
+ hydra.run
142
+ end
143
+
144
+ end
145
+
146
+ hydra.run
147
+
148
+ begin
149
+ download_xml(result)
150
+ rescue
151
+ STDERR.puts "Something went wrong while downloading xml information! please check the output"
152
+ end
153
+
154
+ end
155
+
156
+ def save_error(message)
157
+ path = File.expand_path(File.join(Dir.home, ".unipept", "unipept-#{Time.now.strftime("%F-%T")}.log"))
158
+ FileUtils.mkdir_p File.dirname(path)
159
+ File.open(path, "w") do |f|
160
+ f.write message
161
+ end
162
+ $stderr.puts "API request failed! log can be found in #{path}"
163
+ end
164
+
165
+ def write_to_output(string)
166
+ if options[:output]
167
+ File.open(options[:output], 'a') do |f|
168
+ f.write string
169
+ end
170
+ else
171
+ puts string
172
+ end
173
+ end
174
+
175
+
176
+ def download_xml(result)
177
+ if options[:xml]
178
+ File.open(options[:xml] + ".xml", "wb") do |f|
179
+ f.write Typhoeus.get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=#{result.first.map{|h| h['taxon_id'] }.join(",")}&retmode=xml").response_body
180
+ end
181
+ end
182
+ end
183
+
184
+ def peptide_iterator(peptides, &block)
185
+ first = peptides.first
186
+ if first.start_with? '>'
187
+ # FASTA MODE ENGAGED
188
+ fasta_header = first
189
+ peptides.each_slice(batch_size).with_index do |sub,i|
190
+ fasta_mapper = {}
191
+ sub.map! {|s| s.chomp}
192
+ j = 0
193
+ while j < sub.size
194
+ if sub[j].start_with? '>'
195
+ fasta_header = sub[j]
196
+ else
197
+ fasta_mapper[sub[j]] = fasta_header
198
+ end
199
+ j += 1
200
+ end
201
+ sub -= fasta_mapper.values.uniq
202
+ block.call(sub, i, fasta_mapper)
203
+ end
204
+
205
+ else
206
+ peptides.each_slice(batch_size).with_index(&block)
207
+ end
208
+ end
209
+
210
+ private
211
+
212
+ def glob_to_regex(glob_string)
213
+ # only implement * -> . for now
214
+ Regexp.new glob_string.gsub("*", ".*")
215
+ end
216
+ end
217
+ end
@@ -30,7 +30,7 @@ module Unipept
30
30
  end
31
31
 
32
32
  # JSON formatted data goes in, something other comes out
33
- def format(data)
33
+ def format(data, fasta_mapper = nil)
34
34
  data
35
35
  end
36
36
  end
@@ -49,23 +49,40 @@ module Unipept
49
49
 
50
50
  register :csv
51
51
 
52
- def header(data)
52
+ def header(data, fasta_mapper = nil)
53
53
  CSV.generate do |csv|
54
54
  first = data.first
55
55
  if first.kind_of? Array
56
56
  first = first.first
57
57
  end
58
- csv << first.keys.map(&:to_s) if first
58
+ if fasta_mapper
59
+ csv << (['fasta_header'] + first.keys).map(&:to_s) if first
60
+ else
61
+ csv << first.keys.map(&:to_s) if first
62
+ end
63
+
59
64
  end
60
65
  end
61
66
 
62
- def format(data)
67
+ def format(data, fasta_mapper = nil)
63
68
  CSV.generate do |csv|
64
69
  data.each do |o|
65
70
  if o.kind_of? Array
66
- o.each {|h| csv << h.values.map { |v| v == "" ? nil : v }}
71
+ o.each do |h|
72
+ if fasta_mapper
73
+ extra_key = [fasta_mapper[h.values.first]]
74
+ csv << (extra_key + h.values).map { |v| v == "" ? nil : v }
75
+ else
76
+ csv << h.values.map { |v| v == "" ? nil : v }
77
+ end
78
+ end
67
79
  else
68
- csv << o.values.map { |v| v == "" ? nil : v }
80
+ if fasta_mapper
81
+ extra_key = [fasta_mapper[o.values.first]]
82
+ csv << (extra_key + o.values).map { |v| v == "" ? nil : v }
83
+ else
84
+ csv << o.values.map { |v| v == "" ? nil : v }
85
+ end
69
86
  end
70
87
  end
71
88
  end
data/lib/unipept.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require_relative 'unipept/formatters'
2
2
  require_relative 'unipept/configuration'
3
3
  require_relative 'unipept/batch_order'
4
+ require_relative 'unipept/api_runner'
4
5
 
5
6
  module Unipept
6
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unipept
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Toon Willems
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-14 00:00:00.000000000 Z
11
+ date: 2014-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: shoulda
@@ -131,6 +131,7 @@ files:
131
131
  - bin/unipept
132
132
  - bin/uniprot
133
133
  - lib/unipept.rb
134
+ - lib/unipept/api_runner.rb
134
135
  - lib/unipept/batch_order.rb
135
136
  - lib/unipept/configuration.rb
136
137
  - lib/unipept/formatters.rb