unipept 0.7.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +26 -0
  3. data/.travis.yml +7 -0
  4. data/Gemfile +8 -10
  5. data/Gemfile.lock +35 -21
  6. data/README.md +6 -4
  7. data/Rakefile +11 -10
  8. data/VERSION +1 -1
  9. data/bin/peptfilter +2 -44
  10. data/bin/prot2pept +4 -49
  11. data/bin/unipept +2 -197
  12. data/bin/uniprot +4 -53
  13. data/lib/batch_iterator.rb +73 -0
  14. data/lib/batch_order.rb +20 -0
  15. data/lib/commands/peptfilter.rb +118 -0
  16. data/lib/commands/prot2pept.rb +61 -0
  17. data/lib/commands/unipept/api_runner.rb +199 -0
  18. data/lib/commands/unipept/config.rb +29 -0
  19. data/lib/commands/unipept/pept2lca.rb +12 -0
  20. data/lib/commands/unipept/pept2prot.rb +13 -0
  21. data/lib/{unipept/commands → commands/unipept}/pept2taxa.rb +7 -0
  22. data/lib/commands/unipept/taxa2lca.rb +18 -0
  23. data/lib/{unipept/commands → commands/unipept}/taxonomy.rb +3 -0
  24. data/lib/commands/unipept.rb +226 -0
  25. data/lib/commands/uniprot.rb +69 -0
  26. data/lib/commands.rb +10 -0
  27. data/lib/configuration.rb +45 -0
  28. data/lib/formatters.rb +252 -0
  29. data/lib/version.rb +3 -0
  30. data/test/commands/test_peptfilter.rb +170 -0
  31. data/test/commands/test_prot2pept.rb +82 -0
  32. data/test/commands/test_unipept.rb +37 -0
  33. data/test/commands/test_uniprot.rb +136 -0
  34. data/test/commands/unipept/test_api_runner.rb +486 -0
  35. data/test/commands/unipept/test_config.rb +64 -0
  36. data/test/commands/unipept/test_pept2lca.rb +40 -0
  37. data/test/commands/unipept/test_pept2prot.rb +39 -0
  38. data/test/commands/unipept/test_pept2taxa.rb +39 -0
  39. data/test/commands/unipept/test_taxa2lca.rb +39 -0
  40. data/test/commands/unipept/test_taxonomy.rb +37 -0
  41. data/test/helper.rb +69 -23
  42. data/test/test_bach_order.rb +57 -0
  43. data/test/test_base.rb +6 -0
  44. data/test/test_batch_iterator.rb +87 -0
  45. data/test/test_configuration.rb +43 -0
  46. data/test/test_formatters.rb +140 -0
  47. data/unipept.gemspec +55 -33
  48. metadata +62 -40
  49. data/lib/unipept/batch_order.rb +0 -28
  50. data/lib/unipept/commands/api_runner.rb +0 -239
  51. data/lib/unipept/commands/pept2lca.rb +0 -6
  52. data/lib/unipept/commands/pept2prot.rb +0 -20
  53. data/lib/unipept/commands/taxa2lca.rb +0 -12
  54. data/lib/unipept/commands.rb +0 -7
  55. data/lib/unipept/configuration.rb +0 -29
  56. data/lib/unipept/formatters.rb +0 -135
  57. data/lib/unipept/version.rb +0 -3
  58. data/lib/unipept.rb +0 -8
  59. data/test/test_unipept.rb +0 -7
@@ -0,0 +1,73 @@
1
+ require 'set'
2
+
3
module Unipept
  # Cuts a stream of input lines into batches of at most +batch_size+ lines
  # and hands each batch to a block. Plain input and FASTA input (first line
  # starts with '>') are both supported.
  class BatchIterator
    attr_reader :batch_size

    # @param [Integer] batch_size The maximum number of lines per batch
    def initialize(batch_size)
      @batch_size = batch_size
    end

    # Splits the input lines into slices, based on the batch_size of the current
    # command. Executes the given block for each of the batches.
    #
    # Supports both normal input and input in the fasta format. Nothing is
    # yielded when the input is empty.
    #
    # @param [Enumerator] lines An enumerator containing the input lines
    #
    # @param [Proc] block The code to execute on the slices. It receives the
    #   slice and the batch index; for fasta input it also receives an array
    #   of [header, line] pairs.
    def iterate(lines, &block)
      begin
        first_line = lines.next
      rescue StopIteration
        # Empty input: there is nothing to batch. Only the end-of-input
        # signal is rescued, so real errors raised while reading the first
        # line are no longer silently discarded.
        return
      end

      if fasta? first_line
        fasta_iterator(first_line, lines, &block)
      else
        normal_iterator(first_line, lines, &block)
      end
    end

    # Checks if the given line is a fasta header.
    #
    # @param [String] line The input line
    #
    # @return [Boolean] Whether the input is a fasta header
    def fasta?(line)
      line.start_with? '>'
    end

    private

    # Splits the input lines in fasta format into slices, based on the
    # batch_size of the current command. Executes the given block for each of
    # the batches. Header lines are part of the slices, so they count towards
    # the batch size. Each batch yields the unique non-header lines, the batch
    # index and the [header, line] pairs found in that batch.
    def fasta_iterator(first_line, next_lines, &block)
      current_fasta_header = first_line.chomp
      next_lines.each_slice(batch_size).with_index do |slice, i|
        fasta_mapper = []
        input_set = Set.new

        slice.each do |raw_line|
          # Non-destructive chomp: the caller's strings are left untouched and
          # frozen input lines no longer raise.
          line = raw_line.chomp
          if fasta? line
            current_fasta_header = line
          else
            fasta_mapper << [current_fasta_header, line]
            input_set << line
          end
        end

        block.call(input_set.to_a, i, fasta_mapper)
      end
    end

    # Splits the input lines into slices, based on the batch_size of the current
    # command. Executes the given block for each of the batches.
    def normal_iterator(first_line, next_lines, &block)
      Enumerator.new do |y|
        # The first line was already consumed to detect the input format, so
        # it is put back in front of the remaining lines.
        y << first_line
        # Kernel#loop ends quietly once next_lines raises StopIteration.
        loop do
          y << next_lines.next
        end
      end.each_slice(batch_size).with_index(&block)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Unipept
  # Runs blocks in ascending batch order, whatever order they are handed in.
  class BatchOrder
    attr_reader :order

    def initialize
      @order = {}
      @current = 0
    end

    # Stores the block under index i. When i is the index that is due, runs
    # that block and every directly following block that is already queued.
    def wait(i, &block)
      @order[i] = block
      return if i != @current

      while (queued = @order[@current])
        # Remove the entry before running it, then move on to the next index.
        @order.delete(@current)
        queued.call
        @current += 1
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ module Unipept::Commands
2
+ class Peptfilter
3
+ attr_reader :root_command
4
+
5
+ @root_command = Cri::Command.define do
6
+ name 'peptfilter'
7
+ summary 'Filter peptides based on specific criteria.'
8
+ usage 'peptfilter [options]'
9
+ description <<-EOS
10
+ The peptfilter command filters a list of peptides according to specific criteria. The command expects a list of peptides that are passed to standard input.
11
+
12
+ The input should have one peptide per line. FASTA headers are preserved in the output, so that peptides remain bundled.
13
+ EOS
14
+ # flag :u, :unique, "filter duplicate peptides."
15
+ required nil, :minlen, 'only retain tryptic peptides that have at least min (default: 5) amino acids.'
16
+ required nil, :maxlen, 'only retain tryptic peptides that have at most max (default: 50) amino acids.'
17
+ required :l, :lacks, 'only retain tryptic peptides that lack all amino acids from the string of residues.'
18
+ required :c, :contains, 'only retain tryptic peptides that contain all amino acids from the string of residues.'
19
+ flag :h, :help, 'show help for this command' do |_value, cmd|
20
+ puts cmd.help
21
+ exit 0
22
+ end
23
+ run do |opts, _args, _cmd|
24
+ minlen = opts.fetch(:minlen, '5').to_i
25
+ maxlen = opts.fetch(:maxlen, '50').to_i
26
+ lacks = opts.fetch(:lacks, '').chars.to_a
27
+ contains = opts.fetch(:contains, '').chars.to_a
28
+ $stdin.each_line do |pept|
29
+ # FASTA headers
30
+ if pept.start_with? '>'
31
+ puts pept
32
+ next
33
+ end
34
+
35
+ pept = pept.chomp
36
+ puts pept if Peptfilter.filter(pept, minlen, maxlen, lacks, contains)
37
+ end
38
+ end
39
+ end
40
+
41
+ @root_command.add_command(Cri::Command.new_basic_help)
42
+
43
+ # Invokes the peptfilter command-line tool with the given arguments.
44
+ #
45
+ # @param [Array<String>] args An array of command-line arguments
46
+ #
47
+ # @return [void]
48
+ def self.run(args)
49
+ @root_command.run(args)
50
+ end
51
+
52
+ # Checks if a peptide satisfies the min length, max length, lacks and contains requirements.
53
+ # Returns true if
54
+ # - the peptide length is equal or higher than min
55
+ # - the peptide length is equal or lower than max
56
+ # - the peptide doesn't contain any of the amino acids in lacks
57
+ # - the peptide contains all of the amino acids in contains
58
+ #
59
+ # @param [String] peptide The peptide to check
60
+ #
61
+ # @param [Integer] min The minimal length requirement
62
+ #
63
+ # @param [Integer] max The maximal length requirement
64
+ #
65
+ # @param [Array<String>] lacks The forbidden amino acids
66
+ #
67
+ # @param [Array<String>] contains The required amino acids
68
+ #
69
+ # @return [Boolean] true if the peptide satisfies all requirements
70
+ def self.filter(peptide, min, max, lacks, contains)
71
+ filter_length(peptide, min, max) &&
72
+ filter_lacks(peptide, lacks) &&
73
+ filter_contains(peptide, contains)
74
+ end
75
+
76
+ # Checks if a peptide satisfies the min length and max length requirements.
77
+ # Returns true if
78
+ # - the peptide length is equal or higher than min
79
+ # - the peptide length is equal or lower than max
80
+ #
81
+ # @param [String] peptide The peptide to check
82
+ #
83
+ # @param [Integer] min The minimal length requirement
84
+ #
85
+ # @param [Integer] max The maximal length requirement
86
+ #
87
+ # @return [Boolean] true if the peptide satisfies all requirements
88
def self.filter_length(peptide, min, max)
  # Both bounds are inclusive.
  peptide.length.between?(min, max)
end
91
+
92
+ # Checks if a peptide satisfies the lacks requirement.
93
+ # Returns true if
94
+ # - the peptide doesn't contain any of the amino acids in lacks
95
+ #
96
+ # @param [String] peptide The peptide to check
97
+ #
98
+ # @param [Array<String>] lacks The forbidden amino acids
99
+ #
100
+ # @return [Boolean] true if the peptide satisfies all requirements
101
def self.filter_lacks(peptide, lacks)
  # The peptide passes when none of the forbidden residues occurs in it.
  residues = peptide.chars.to_a
  lacks.none? { |amino_acid| residues.include?(amino_acid) }
end
104
+
105
+ # Checks if a peptide satisfies the contains requirement.
106
+ # Returns true if
107
+ # - the peptide contains all of the amino acids in contains
108
+ #
109
+ # @param [String] peptide The peptide to check
110
+ #
111
+ # @param [Array<String>] contains The required amino acids
112
+ #
113
+ # @return [Boolean] true if the peptide satisfies all requirements
114
def self.filter_contains(peptide, contains)
  # Check every required residue on its own. Comparing the size of an array
  # intersection with contains.size rejects every peptide as soon as a
  # residue is listed twice (e.g. "AA"), because Array#& drops duplicates.
  residues = peptide.chars.to_a
  contains.all? { |amino_acid| residues.include?(amino_acid) }
end
117
+ end
118
+ end
@@ -0,0 +1,61 @@
1
+ module Unipept::Commands
2
+ class Prot2pept
3
+ attr_reader :root_command
4
+ attr_reader :valid_formats
5
+
6
+ @root_command = Cri::Command.define do
7
+ name 'prot2pept'
8
+ summary 'Split protein sequences into peptides.'
9
+ usage 'prot2pept [options]'
10
+ description <<-EOS
11
+ The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.
12
+
13
+ The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.
14
+
15
+ EOS
16
+ required :p, :pattern, 'specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved (default: ([KR])([^P]) for tryptic peptides).'
17
+ flag :h, :help, 'show help for this command' do |_value, cmd|
18
+ puts cmd.help
19
+ exit 0
20
+ end
21
+ run do |opts, _args, _cmd|
22
+ pattern = opts.fetch(:pattern, '([KR])([^P])')
23
+
24
+ # decide if we have FASTA input
25
+ first_char = $stdin.getc
26
+ $stdin.ungetc(first_char)
27
+ if first_char == '>'
28
+ # fasta mode!
29
+ protein = ''
30
+ while (line = $stdin.gets)
31
+ if line.start_with? '>'
32
+ puts Prot2pept.split(protein, pattern)
33
+ protein = ''
34
+ puts line
35
+ else
36
+ protein += line.chomp
37
+ end
38
+ end
39
+ puts Prot2pept.split(protein, pattern)
40
+ else
41
+ $stdin.each_line do |prot|
42
+ puts Prot2pept.split(prot, pattern)
43
+ end
44
+ end
45
+ end
46
+ end
47
+
48
def self.split(protein, pattern)
  cleavage = /#{pattern}/
  replacement = "\\1\n\\2"
  # A second substitution pass picks up cleavage sites that overlap a match
  # of the first pass; empty pieces are dropped at the end.
  first_pass = protein.gsub(cleavage, replacement)
  second_pass = first_pass.gsub(cleavage, replacement)
  second_pass.split("\n").reject(&:empty?)
end
51
+
52
+ # Invokes the prot2pept command-line tool with the given arguments.
53
+ #
54
+ # @param [Array<String>] args An array of command-line arguments
55
+ #
56
+ # @return [void]
57
+ def self.run(args)
58
+ @root_command.run(args)
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,199 @@
1
+ module Unipept
2
+ class Commands::ApiRunner < Cri::CommandRunner
3
+ attr_reader :configuration
4
+
5
+ attr_reader :url
6
+
7
+ attr_reader :message_url
8
+
9
+ attr_reader :user_agent
10
+
11
+ def initialize(args, opts, cmd)
12
+ super
13
+ @configuration = Unipept::Configuration.new
14
+ set_configuration
15
+
16
+ @url = "#{@host}/api/v1/#{cmd.name}.json"
17
+ @message_url = "#{@host}/api/v1/messages.json"
18
+ end
19
+
20
+ # Sets the configurable options of the command line app:
21
+ # - the host
22
+ # - the user agent
23
+ def set_configuration
24
+ @host = host
25
+ @user_agent = 'Unipept CLI - unipept ' + Unipept::VERSION
26
+ end
27
+
28
+ # Returns the host. If a value is defined by both an option and the config
29
+ # file, the value of the option is used.
30
def host
  # The command-line option wins over the value from the config file.
  candidate = options[:host] || @configuration['host']

  # Stop right away when no host is configured at all.
  if candidate.nil? || candidate.empty?
    abort 'WARNING: no host has been set, you can set the host with `unipept config host http://api.unipept.ugent.be/`'
  end

  # Prepend a scheme unless one is already present.
  return candidate if candidate.start_with?('http://', 'https://')

  "http://#{candidate}"
end
46
+
47
+ # Returns an input iterator to use for the request.
48
+ # - if arguments are given, uses arguments
49
+ # - if the input file option is given, uses file input
50
+ # - if none of the previous are given, uses stdin
51
+ def input_iterator
52
+ return arguments.each unless arguments.empty?
53
+ return IO.foreach(options[:input]) if options[:input]
54
+ $stdin.each_line
55
+ end
56
+
57
+ # Returns the default batch_size of a command.
58
+ def batch_size
59
+ 100
60
+ end
61
+
62
+ # Constructs a request body (a Hash) for set of input strings, using the
63
+ # options supplied by the user.
64
def construct_request_body(input)
  fields = selected_fields
  # Names are wanted when no field selection exists or when at least one
  # selected field mentions 'name'.
  wants_names = fields.empty? || fields.any? { |field| field.to_s.include?('name') }
  all = options[:all] == true

  {
    input: input,
    equate_il: options[:equate] == true,
    extra: all,
    names: all && wants_names
  }
end
72
+
73
+ # Returns an array of regular expressions containing all the selected fields
74
def selected_fields
  # Memoized: the select options are only parsed on the first call.
  @selected_fields ||= Array(options[:select])
                       .flat_map { |spec| spec.split(',') }
                       .map { |glob| glob_to_regex(glob) }
end
77
+
78
+ # Returns a formatter, based on the format specified in the options
79
+ def formatter
80
+ @formatter ||= Unipept::Formatter.new_for_format(options[:format])
81
+ end
82
+
83
+ # Checks if the server has a message and prints it if not empty.
84
+ # We will only check this once a day and won't print anything if the quiet
85
+ # option is set or if we output to a file.
86
+ def print_server_message
87
+ return if options[:quiet]
88
+ return unless $stdout.tty?
89
+ return if recently_fetched?
90
+ @configuration['last_fetch_date'] = Time.now
91
+ @configuration.save
92
+ resp = fetch_server_message
93
+ puts resp unless resp.empty?
94
+ end
95
+
96
+ # Fetches a message from the server and returns it
97
+ def fetch_server_message
98
+ Typhoeus.get(@message_url, params: { version: Unipept::VERSION }).body.chomp
99
+ end
100
+
101
+ # Returns true if the last check for a server message was less than a day
102
+ # ago.
103
def recently_fetched?
  stamp = @configuration['last_fetch_date']
  # No stored date means the message was never fetched.
  return false if stamp.nil?

  # 60 * 60 * 24 seconds = one day
  Time.now < stamp + 60 * 60 * 24
end
107
+
108
+ # Returns a new batch_iterator based on the batch_size
109
+ def batch_iterator
110
+ Unipept::BatchIterator.new(batch_size)
111
+ end
112
+
113
+ # Runs the command
114
+ def run
115
+ print_server_message
116
+ hydra = Typhoeus::Hydra.new(max_concurrency: 10)
117
+ batch_order = Unipept::BatchOrder.new
118
+
119
+ batch_iterator.iterate(input_iterator) do |input_slice, batch_id, fasta_mapper|
120
+ request = Typhoeus::Request.new(
121
+ @url,
122
+ method: :post,
123
+ body: construct_request_body(input_slice),
124
+ accept_encoding: 'gzip',
125
+ headers: { 'User-Agent' => @user_agent }
126
+ )
127
+
128
+ request.on_complete do |resp|
129
+ block = handle_response(resp, batch_id, fasta_mapper)
130
+ batch_order.wait(batch_id, &block)
131
+ end
132
+
133
+ hydra.queue request
134
+ hydra.run if batch_id % 200 == 0
135
+ end
136
+
137
+ hydra.run
138
+ end
139
+
140
+ # Saves an error to a new file in the .unipept directory in the user's home
141
+ # directory.
142
+ def save_error(message)
143
+ path = error_file_path
144
+ FileUtils.mkdir_p File.dirname(path)
145
+ File.open(path, 'w') { |f| f.write message }
146
+ $stderr.puts "API request failed! log can be found in #{path}"
147
+ end
148
+
149
+ # Write a string to the output defined by the command. If a file is given,
150
+ # write it to the file. If not, write to stdout
151
+ def write_to_output(string)
152
+ if options[:output]
153
+ File.open(options[:output], 'a') { |f| f.write string }
154
+ else
155
+ puts string
156
+ end
157
+ end
158
+
159
+ private
160
+
161
+ def error_file_path
162
+ File.expand_path(File.join(Dir.home, '.unipept', "unipept-#{Time.now.strftime('%F-%T')}.log"))
163
+ end
164
+
165
+ # Handles the response of an API request.
166
+ # Returns a block to execute.
167
def handle_response(response, batch_id, fasta_mapper)
  # Every branch returns a lambda instead of acting right away; the caller
  # decides when to run it.
  if response.success?
    result = filter_result(response.response_body)

    lambda do
      unless result.empty?
        # The header is written for the first batch only.
        # NOTE(review): an empty first batch means no header is written at
        # all - confirm this is intended.
        write_to_output formatter.header(result, fasta_mapper) if batch_id == 0
        write_to_output formatter.format(result, fasta_mapper)
      end
    end
  elsif response.timed_out?
    -> { save_error('request timed out, continuing anyway, but results might be incomplete') }
  elsif response.code == 0
    # Interpolation separates the message from the transport error and does
    # not raise when return_message is nil.
    -> { save_error("could not get an http response, continuing anyway, but results might be incomplete: #{response.return_message}") }
  else
    -> { save_error("Got #{response.code}: #{response.response_body}\nRequest headers: #{response.request.options}\nRequest body:\n#{response.request.encoded_body}\n\n") }
  end
end
185
+
186
+ # Parses the json_response, wraps it in an array if needed and filters the
187
+ # fields based on the selected_fields
188
def filter_result(json_response)
  result = begin
    JSON[json_response]
  rescue StandardError
    # Best effort: an unparsable response counts as "no results".
    []
  end
  result = [result] unless result.is_a? Array
  return result if selected_fields.empty?

  # Hash#select (not select!) is required here: select! returns nil when no
  # key is removed, which turned fully matching records into nil.
  result.map do |record|
    record.select { |key, _value| selected_fields.any? { |field| field.match key } }
  end
end
194
+
195
def glob_to_regex(string)
  # Each '*' wildcard becomes '.*'; the pattern is anchored on both ends.
  body = string.gsub('*', '.*')
  Regexp.new("^#{body}$")
end
198
+ end
199
+ end
@@ -0,0 +1,29 @@
1
+ module Unipept
2
+ class Commands::Config < Cri::CommandRunner
3
def run
  count = arguments.size
  # Exactly one argument (read) or two arguments (write) are accepted.
  abort command.help if count == 0 || count > 2

  key, value = *arguments

  if count != 2
    puts get_config(key)
  else
    set_config(key, value)
    puts key + ' was set to ' + value
  end
end
15
+
16
+ def config
17
+ @config ||= Unipept::Configuration.new
18
+ end
19
+
20
+ def set_config(key, value)
21
+ config[key] = value
22
+ config.save
23
+ end
24
+
25
+ def get_config(key)
26
+ config[key]
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,12 @@
1
+ require_relative 'api_runner'
2
+ module Unipept::Commands
3
+ class Pept2lca < ApiRunner
4
def batch_size
  # A smaller batch is used when the :all option is set.
  options[:all] ? 100 : 1000
end
11
+ end
12
+ end
@@ -0,0 +1,13 @@
1
+ require_relative 'api_runner'
2
+
3
+ module Unipept::Commands
4
+ class Pept2prot < ApiRunner
5
def batch_size
  # A smaller batch is used when the :all option is set.
  options[:all] ? 5 : 10
end
12
+ end
13
+ end
@@ -1,5 +1,12 @@
1
1
  require_relative 'api_runner'
2
2
  module Unipept::Commands
3
3
  class Pept2taxa < ApiRunner
4
def batch_size
  # A smaller batch is used when the :all option is set.
  options[:all] ? 5 : 10
end
4
11
  end
5
12
  end
@@ -0,0 +1,18 @@
1
+ require_relative 'api_runner'
2
+ module Unipept::Commands
3
+ class Taxa2lca < ApiRunner
4
+ def batch_iterator
5
+ SimpleBatchIterator.new
6
+ end
7
+
8
+ def batch_size
9
+ fail 'NOT NEEDED FOR TAXA2LCA'
10
+ end
11
+ end
12
+
13
+ class SimpleBatchIterator
14
+ def iterate(input, &block)
15
+ block.call(input.to_a, 0)
16
+ end
17
+ end
18
+ end
@@ -1,5 +1,8 @@
1
1
  require_relative 'api_runner'
2
2
  module Unipept::Commands
3
3
  class Taxonomy < ApiRunner
4
+ def batch_size
5
+ 100
6
+ end
4
7
  end
5
8
  end