unipept 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +26 -0
  3. data/.travis.yml +7 -0
  4. data/Gemfile +8 -10
  5. data/Gemfile.lock +35 -21
  6. data/README.md +6 -4
  7. data/Rakefile +11 -10
  8. data/VERSION +1 -1
  9. data/bin/peptfilter +2 -44
  10. data/bin/prot2pept +4 -49
  11. data/bin/unipept +2 -197
  12. data/bin/uniprot +4 -53
  13. data/lib/batch_iterator.rb +73 -0
  14. data/lib/batch_order.rb +20 -0
  15. data/lib/commands/peptfilter.rb +118 -0
  16. data/lib/commands/prot2pept.rb +61 -0
  17. data/lib/commands/unipept/api_runner.rb +199 -0
  18. data/lib/commands/unipept/config.rb +29 -0
  19. data/lib/commands/unipept/pept2lca.rb +12 -0
  20. data/lib/commands/unipept/pept2prot.rb +13 -0
  21. data/lib/{unipept/commands → commands/unipept}/pept2taxa.rb +7 -0
  22. data/lib/commands/unipept/taxa2lca.rb +18 -0
  23. data/lib/{unipept/commands → commands/unipept}/taxonomy.rb +3 -0
  24. data/lib/commands/unipept.rb +226 -0
  25. data/lib/commands/uniprot.rb +69 -0
  26. data/lib/commands.rb +10 -0
  27. data/lib/configuration.rb +45 -0
  28. data/lib/formatters.rb +252 -0
  29. data/lib/version.rb +3 -0
  30. data/test/commands/test_peptfilter.rb +170 -0
  31. data/test/commands/test_prot2pept.rb +82 -0
  32. data/test/commands/test_unipept.rb +37 -0
  33. data/test/commands/test_uniprot.rb +136 -0
  34. data/test/commands/unipept/test_api_runner.rb +486 -0
  35. data/test/commands/unipept/test_config.rb +64 -0
  36. data/test/commands/unipept/test_pept2lca.rb +40 -0
  37. data/test/commands/unipept/test_pept2prot.rb +39 -0
  38. data/test/commands/unipept/test_pept2taxa.rb +39 -0
  39. data/test/commands/unipept/test_taxa2lca.rb +39 -0
  40. data/test/commands/unipept/test_taxonomy.rb +37 -0
  41. data/test/helper.rb +69 -23
  42. data/test/test_bach_order.rb +57 -0
  43. data/test/test_base.rb +6 -0
  44. data/test/test_batch_iterator.rb +87 -0
  45. data/test/test_configuration.rb +43 -0
  46. data/test/test_formatters.rb +140 -0
  47. data/unipept.gemspec +55 -33
  48. metadata +62 -40
  49. data/lib/unipept/batch_order.rb +0 -28
  50. data/lib/unipept/commands/api_runner.rb +0 -239
  51. data/lib/unipept/commands/pept2lca.rb +0 -6
  52. data/lib/unipept/commands/pept2prot.rb +0 -20
  53. data/lib/unipept/commands/taxa2lca.rb +0 -12
  54. data/lib/unipept/commands.rb +0 -7
  55. data/lib/unipept/configuration.rb +0 -29
  56. data/lib/unipept/formatters.rb +0 -135
  57. data/lib/unipept/version.rb +0 -3
  58. data/lib/unipept.rb +0 -8
  59. data/test/test_unipept.rb +0 -7
@@ -0,0 +1,73 @@
1
+ require 'set'
2
+
3
module Unipept
  # Cuts a stream of input lines into batches of at most +batch_size+ lines and
  # hands every batch to a block. Understands both plain input (one item per
  # line) and input in the fasta format.
  class BatchIterator
    attr_reader :batch_size

    # @param [Integer] batch_size The maximum number of lines per batch
    def initialize(batch_size)
      @batch_size = batch_size
    end

    # Splits the input lines into slices, based on the batch_size of the current
    # command. Executes the given block for each of the batches.
    #
    # Supports both normal input and input in the fasta format. For normal
    # input the block receives +(slice, index)+; for fasta input it receives
    # +(unique_sequences, index, fasta_mapper)+ where fasta_mapper is a list of
    # +[header, sequence]+ pairs.
    #
    # @param [Enumerator] lines An iterator containing the input lines
    #
    # @param [Proc] block The code to execute on the slices
    #
    # @return [nil] when the input contains no lines at all
    def iterate(lines, &block)
      begin
        first_line = lines.next
      rescue StopIteration
        # Empty input: nothing to batch. Any other error (for example an
        # unreadable input file) is deliberately left to propagate instead of
        # being mistaken for empty input.
        return
      end

      if fasta? first_line
        fasta_iterator(first_line, lines, &block)
      else
        normal_iterator(first_line, lines, &block)
      end
    end

    # Checks if the given line is a fasta header.
    #
    # @param [String] line The input line
    #
    # @return [Boolean] Whether the input is a fasta header
    def fasta?(line)
      line.start_with? '>'
    end

    private

    # Splits the input lines in fasta format into slices, based on the
    # batch_size of the current command. Executes the given block for each of
    # the batches.
    #
    # The header seen most recently is remembered across slices, so a sequence
    # at the start of a batch is still attributed to the right header.
    def fasta_iterator(first_line, next_lines, &block)
      current_fasta_header = first_line.chomp
      next_lines.each_slice(batch_size).with_index do |slice, i|
        fasta_mapper = []
        input_set = Set.new

        slice.each do |raw_line|
          # Non-destructive chomp: the caller's strings are left untouched, so
          # frozen input lines work as well.
          line = raw_line.chomp
          if fasta? line
            current_fasta_header = line
          else
            fasta_mapper << [current_fasta_header, line]
            input_set << line
          end
        end

        block.call(input_set.to_a, i, fasta_mapper)
      end
    end

    # Splits the input lines into slices, based on the batch_size of the current
    # command. Executes the given block for each of the batches.
    def normal_iterator(first_line, next_lines, &block)
      Enumerator.new do |y|
        # The first line was already consumed to sniff the format; put it back
        # in front of the remaining lines.
        y << first_line
        loop do
          y << next_lines.next
        end
      end.each_slice(batch_size).with_index(&block)
    end
  end
end
@@ -0,0 +1,20 @@
1
module Unipept
  # Serialises callbacks that may arrive out of order: the callback registered
  # for batch number N is only executed once the callbacks for batches 0..N-1
  # have been executed.
  class BatchOrder
    attr_reader :order

    def initialize
      @order = {}
      @current = 0
    end

    # Executes block if it's its turn, queues the block in the other case.
    # After running a block, every directly following queued block is run too.
    #
    # @param [Integer] i The sequence number of the batch
    #
    # @param [Proc] block The code to run when it is batch i's turn. When no
    #   block is given the batch is treated as having nothing to do, so it
    #   cannot stall the batches queued behind it.
    #
    # @return [nil]
    def wait(i, &block)
      @order[i] = block || -> {}
      return unless i == @current
      while order[@current]
        begin
          order.delete(@current).call
        ensure
          # Advance even when the block raises, otherwise every later batch
          # would stay queued forever.
          @current += 1
        end
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ module Unipept::Commands
2
+ class Peptfilter
3
+ attr_reader :root_command
4
+
5
# Cri definition of the peptfilter executable. When run, it copies FASTA
# headers from standard input to standard output untouched and echoes only
# the peptides accepted by Peptfilter.filter.
@root_command = Cri::Command.define do
  name 'peptfilter'
  summary 'Filter peptides based on specific criteria.'
  usage 'peptfilter [options]'
  description <<-EOS
The peptfilter command filters a list of peptides according to specific criteria. The command expects a list of peptides that are passed to standard input.

The input should have one peptide per line. FASTA headers are preserved in the output, so that peptides remain bundled.
  EOS
  # flag :u, :unique, "filter duplicate peptides."
  required nil, :minlen, 'only retain tryptic peptides that have at least min (default: 5) amino acids.'
  required nil, :maxlen, 'only retain tryptic peptides that have at most max (default: 50) amino acids.'
  required :l, :lacks, 'only retain tryptic peptides that lack all amino acids from the string of residues.'
  required :c, :contains, 'only retain tryptic peptides that contain all amino acids from the string of residues.'
  flag :h, :help, 'show help for this command' do |_value, cmd|
    puts cmd.help
    exit 0
  end
  run do |opts, _args, _cmd|
    shortest = opts.fetch(:minlen, '5').to_i
    longest = opts.fetch(:maxlen, '50').to_i
    forbidden = opts.fetch(:lacks, '').chars.to_a
    mandatory = opts.fetch(:contains, '').chars.to_a

    $stdin.each_line do |line|
      if line.start_with? '>'
        # FASTA headers are passed through as-is
        puts line
      else
        peptide = line.chomp
        puts peptide if Peptfilter.filter(peptide, shortest, longest, forbidden, mandatory)
      end
    end
  end
end

@root_command.add_command(Cri::Command.new_basic_help)
42
+
43
+ # Invokes the peptfilter command-line tool with the given arguments.
44
+ #
45
+ # @param [Array<String>] args An array of command-line arguments
46
+ #
47
+ # @return [void]
48
def self.run(args)
  # The command object lives in a class-level instance variable.
  command = @root_command
  command.run(args)
end
51
+
52
+ # Checks if a peptide satisfies the min length, max length, lacks and contains requirements.
53
+ # Returns true if
54
+ # - the peptide length is equal or higher than min
55
+ # - the peptide length is equal or lower than max
56
+ # - the peptide doesn't contain any of the amino acids in lacks
57
+ # - the peptide contains all of the amino acids in contains
58
+ #
59
+ # @param [String] peptide The peptide to check
60
+ #
61
+ # @param [Integer] min The minimal length requirement
62
+ #
63
+ # @param [Integer] max The maximal length requirement
64
+ #
65
+ # @param [Array<String>] lacks The forbidden amino acids
66
+ #
67
+ # @param [Array<String>] contains The required amino acids
68
+ #
69
+ # @return [Boolean] true if the peptide satisfies all requirements
70
def self.filter(peptide, min, max, lacks, contains)
  # Checks run in the same order as before and stop at the first failure.
  return false unless filter_length(peptide, min, max)
  return false unless filter_lacks(peptide, lacks)

  filter_contains(peptide, contains)
end
75
+
76
+ # Checks if a peptide satisfies the min length and max length requirements.
77
+ # Returns true if
78
+ # - the peptide length is equal or higher than min
79
+ # - the peptide length is equal or lower than max
80
+ #
81
+ # @param [String] peptide The peptide to check
82
+ #
83
+ # @param [Integer] min The minimal length requirement
84
+ #
85
+ # @param [Integer] max The maximal length requirement
86
+ #
87
+ # @return [Boolean] true if the peptide satisfies all requirements
88
def self.filter_length(peptide, min, max)
  # Both bounds are inclusive.
  peptide.length.between?(min, max)
end
91
+
92
+ # Checks if a peptide satisfies lacks requirement.
93
+ # Returns true if
94
+ # - the peptide doesn't contain any of the amino acids in lacks
95
+ #
96
+ # @param [String] peptide The peptide to check
97
+ #
98
+ # @param [Array<String>] lacks The forbidden amino acids
99
+ #
100
+ # @return [Boolean] true if the peptide satisfies all requirements
101
def self.filter_lacks(peptide, lacks)
  # True when not a single residue of the peptide is on the forbidden list.
  peptide.each_char.none? { |residue| lacks.include?(residue) }
end
104
+
105
+ # Checks if a peptide satisfies the contains requirement.
106
+ # Returns true if
107
+ # - the peptide contains all of the amino acids in contains
108
+ #
109
+ # @param [String] peptide The peptide to check
110
+ #
111
+ # @param [Array<String>] contains The required amino acids
112
+ #
113
+ # @return [Boolean] true if the peptide satisfies all requirements
114
def self.filter_contains(peptide, contains)
  # Remove every required residue that occurs in the peptide; whatever is
  # left over is missing. Unlike comparing intersection sizes, this is not
  # thrown off by a residue that is listed more than once in +contains+.
  (contains - peptide.chars).empty?
end
117
+ end
118
+ end
@@ -0,0 +1,61 @@
1
+ module Unipept::Commands
2
+ class Prot2pept
3
+ attr_reader :root_command
4
+ attr_reader :valid_formats
5
+
6
# Cri definition of the prot2pept executable. When run, it reads protein
# sequences from standard input (plain or FASTA) and prints the peptides
# produced by Prot2pept.split, echoing FASTA headers in between.
@root_command = Cri::Command.define do
  name 'prot2pept'
  summary 'Split protein sequences into peptides.'
  usage 'prot2pept [options]'
  description <<-EOS
The prot2pept command splits each protein sequence into a list of peptides according to a given cleavage-pattern. The command expects a list of protein sequences that are passed to standard input.

The input should have either one protein sequence per line or contain a FASTA formatted list of protein sequences. FASTA headers are preserved in the output, so that peptides can be bundled per protein sequence.

  EOS
  required :p, :pattern, 'specify cleavage-pattern (regex) as the pattern after which the next peptide will be cleaved (default: ([KR])([^P]) for tryptic peptides).'
  flag :h, :help, 'show help for this command' do |_value, cmd|
    puts cmd.help
    exit 0
  end
  run do |opts, _args, _cmd|
    cleavage_pattern = opts.fetch(:pattern, '([KR])([^P])')

    # Peek at the first character to detect FASTA input, then push it back so
    # the line readers below still see the complete first line.
    lookahead = $stdin.getc
    $stdin.ungetc(lookahead)

    if lookahead != '>'
      # one protein sequence per line
      $stdin.each_line { |prot| puts Prot2pept.split(prot, cleavage_pattern) }
    else
      # fasta mode: a sequence may span several lines, so collect the lines
      # until the next header (or the end of the input) before splitting
      sequence = ''
      while (line = $stdin.gets)
        if line.start_with? '>'
          puts Prot2pept.split(sequence, cleavage_pattern)
          sequence = ''
          puts line
        else
          sequence += line.chomp
        end
      end
      puts Prot2pept.split(sequence, cleavage_pattern)
    end
  end
end
47
+
48
# Cleaves a protein sequence into peptides.
#
# @param [String] protein The protein sequence
#
# @param [String] pattern Source of a regex with two capture groups; a line
#   break is inserted between the two groups of every match
#
# @return [Array<String>] the non-empty fragments, in order
def self.split(protein, pattern)
  cleavage = /#{pattern}/
  # Two passes: a match consumes the residue after the cleavage site, so
  # directly adjacent sites are only found the second time around.
  cleaved = 2.times.reduce(protein) { |sequence, _pass| sequence.gsub(cleavage, "\\1\n\\2") }
  cleaved.split("\n").reject(&:empty?)
end
51
+
52
+ # Invokes the prot2pept command-line tool with the given arguments.
53
+ #
54
+ # @param [Array<String>] args An array of command-line arguments
55
+ #
56
+ # @return [void]
57
def self.run(args)
  # The command object lives in a class-level instance variable.
  command = @root_command
  command.run(args)
end
60
+ end
61
+ end
@@ -0,0 +1,199 @@
1
+ module Unipept
2
+ class Commands::ApiRunner < Cri::CommandRunner
3
+ attr_reader :configuration
4
+
5
+ attr_reader :url
6
+
7
+ attr_reader :message_url
8
+
9
+ attr_reader :user_agent
10
+
11
# Loads the configuration, resolves the host and user agent, and derives the
# endpoint URLs for this command and for the server messages.
def initialize(args, opts, cmd)
  super
  @configuration = Unipept::Configuration.new
  set_configuration

  api_root = "#{@host}/api/v1"
  @url = "#{api_root}/#{cmd.name}.json"
  @message_url = "#{api_root}/messages.json"
end
19
+
20
+ # Sets the configurable options of the command line app:
21
+ # - the host
22
+ # - the user agent
23
def set_configuration
  @host = host
  @user_agent = "Unipept CLI - unipept #{Unipept::VERSION}"
end
27
+
28
+ # Returns the host. If a value is defined by both an option and the config
29
+ # file, the value of the option is used.
30
def host
  # the command-line option takes precedence over the config file
  candidate = options[:host] || @configuration['host']

  # No host has been set?
  if candidate.nil? || candidate.empty?
    abort 'WARNING: no host has been set, you can set the host with `unipept config host http://api.unipept.ugent.be/`'
  end

  # add http:// if needed
  return candidate if candidate.start_with?('http://', 'https://')

  "http://#{candidate}"
end
46
+
47
+ # Returns an input iterator to use for the request.
48
+ # - if arguments are given, uses arguments
49
+ # - if the input file option is given, uses file input
50
+ # - if none of the previous are given, uses stdin
51
def input_iterator
  if !arguments.empty?
    arguments.each
  elsif options[:input]
    IO.foreach(options[:input])
  else
    $stdin.each_line
  end
end
56
+
57
+ # Returns the default batch_size of a command.
58
def batch_size
  # Fallback value; the concrete command runners define their own.
  100
end
61
+
62
+ # Constructs a request body (a Hash) for a set of input strings, using the
63
+ # options supplied by the user.
64
def construct_request_body(input)
  fields = selected_fields
  # Names are wanted when nothing is selected or a selector mentions 'name'.
  wants_names = fields.empty? || fields.any? { |field| field.to_s.include? 'name' }
  all = options[:all] == true

  {
    input: input,
    equate_il: options[:equate] == true,
    extra: all,
    names: all && wants_names
  }
end
72
+
73
+ # Returns an array of regular expressions containing all the selected fields
74
def selected_fields
  # Every --select value may itself be a comma-separated list of globs.
  @selected_fields ||= Array(options[:select])
                       .flat_map { |selection| selection.split(',') }
                       .map { |glob| glob_to_regex(glob) }
end
77
+
78
+ # Returns a formatter, based on the format specified in the options
79
def formatter
  # Built once, then reused for every batch.
  return @formatter if @formatter

  @formatter = Unipept::Formatter.new_for_format(options[:format])
end
82
+
83
+ # Checks if the server has a message and prints it if not empty.
84
+ # We will only check this once a day and won't print anything if the quiet
85
+ # option is set or if we output to a file.
86
def print_server_message
  return if options[:quiet] || !$stdout.tty? || recently_fetched?

  # Record the attempt before fetching the message.
  @configuration['last_fetch_date'] = Time.now
  @configuration.save
  message = fetch_server_message
  puts message unless message.empty?
end
95
+
96
+ # Fetches a message from the server and returns it
97
def fetch_server_message
  # The client version is sent along as a query parameter.
  response = Typhoeus.get(@message_url, params: { version: Unipept::VERSION })
  response.body.chomp
end
100
+
101
+ # Returns true if the last check for a server message was less than a day
102
+ # ago.
103
def recently_fetched?
  last_fetched = @configuration['last_fetch_date']
  return false if last_fetched.nil?

  # 24 hours, in seconds
  last_fetched + (24 * 60 * 60) > Time.now
end
107
+
108
+ # Returns a new batch_iterator based on the batch_size
109
def batch_iterator
  slice_length = batch_size
  Unipept::BatchIterator.new(slice_length)
end
112
+
113
+ # Runs the command
114
def run
  print_server_message
  hydra = Typhoeus::Hydra.new(max_concurrency: 10)
  ordering = Unipept::BatchOrder.new

  batch_iterator.iterate(input_iterator) do |input_slice, batch_id, fasta_mapper|
    request = Typhoeus::Request.new(
      @url,
      method: :post,
      body: construct_request_body(input_slice),
      accept_encoding: 'gzip',
      headers: { 'User-Agent' => @user_agent }
    )

    # Responses can complete in any order; the BatchOrder makes sure the
    # output handlers still run in batch order.
    request.on_complete do |resp|
      ordering.wait(batch_id, &handle_response(resp, batch_id, fasta_mapper))
    end

    hydra.queue request
    # Run the queued requests every 200 batches (including batch 0).
    hydra.run if (batch_id % 200).zero?
  end

  # Process whatever is still queued.
  hydra.run
end
139
+
140
+ # Saves an error to a new file in the .unipept directory in the user's home
141
+ # directory.
142
def save_error(message)
  log_path = error_file_path
  # Create the containing directory if it does not exist yet.
  FileUtils.mkdir_p File.dirname(log_path)
  File.write(log_path, message)
  $stderr.puts "API request failed! log can be found in #{log_path}"
end
148
+
149
+ # Write a string to the output defined by the command. If a file is given,
150
+ # write it to the file. If not, write to stdout
151
def write_to_output(string)
  target = options[:output]
  return puts(string) unless target

  # Append: this method is called several times while a command runs.
  File.open(target, 'a') { |f| f.write string }
end
158
+
159
+ private
160
+
161
# Returns the absolute path of a timestamped log file in the .unipept
# directory of the user's home directory.
def error_file_path
  timestamp = Time.now.strftime('%F-%T')
  relative = File.join(Dir.home, '.unipept', "unipept-#{timestamp}.log")
  File.expand_path(relative)
end
164
+
165
+ # Handles the response of an API request.
166
+ # Returns a block to execute.
167
def handle_response(response, batch_id, fasta_mapper)
  unless response.success?
    if response.timed_out?
      return -> { save_error('request timed out, continuing anyway, but results might be incomplete') }
    end
    if response.code == 0
      return -> { save_error('could not get an http response, continuing anyway, but results might be incomplete' + response.return_message) }
    end
    return -> { save_error("Got #{response.code}: #{response.response_body}\nRequest headers: #{response.request.options}\nRequest body:\n#{response.request.encoded_body}\n\n") }
  end

  # The body is parsed right away; only the writing is deferred to the
  # returned lambda.
  result = filter_result(response.response_body)

  lambda do
    next if result.empty?

    # the header is only written for the first batch
    write_to_output formatter.header(result, fasta_mapper) if batch_id == 0
    write_to_output formatter.format(result, fasta_mapper)
  end
end
185
+
186
+ # Parses the json_response, wraps it in an array if needed and filters the
187
+ # fields based on the selected_fields
188
def filter_result(json_response)
  result = begin
    JSON[json_response]
  rescue StandardError
    # best effort: a body that cannot be parsed counts as "no results"
    []
  end
  result = [result] unless result.is_a? Array
  return result if selected_fields.empty?

  # Non-destructive select: Hash#select! returns nil when it removes nothing,
  # which would turn records whose fields are all selected into nil.
  result.map do |record|
    record.select { |key, _value| selected_fields.any? { |field| field.match key } }
  end
end
194
+
195
# Converts a glob into a regular expression that has to match a complete
# field name. Only '*' is a wildcard (any run of characters); every other
# character is matched literally.
#
# @param [String] string The glob
#
# @return [Regexp] the equivalent regular expression
def glob_to_regex(string)
  # Escape the literal parts, so characters such as '.' or '[' neither act as
  # regex syntax nor raise a RegexpError. The -1 limit keeps the empty
  # fragment after a trailing '*'.
  pattern = string.split('*', -1).map { |literal| Regexp.escape(literal) }.join('.*')
  # \A and \z anchor the whole string; ^ and $ would anchor per line.
  /\A#{pattern}\z/
end
198
+ end
199
+ end
@@ -0,0 +1,29 @@
1
module Unipept
  # Runner of the config command: with one argument it prints the stored value
  # of that key, with two arguments it stores the value under the key.
  class Commands::Config < Cri::CommandRunner
    def run
      abort command.help unless [1, 2].include?(arguments.size)

      key, value = *arguments

      if arguments.size == 1
        puts get_config(key)
      else
        set_config(key, value)
        puts "#{key} was set to #{value}"
      end
    end

    # The configuration, created on first use.
    def config
      @config ||= Unipept::Configuration.new
    end

    # Stores value under key and saves the configuration.
    def set_config(key, value)
      config[key] = value
      config.save
    end

    # Looks up the stored value of key.
    def get_config(key)
      config[key]
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require_relative 'api_runner'
2
module Unipept::Commands
  # Runner of the pept2lca command.
  class Pept2lca < ApiRunner
    # Batches are ten times smaller when the all option is set.
    def batch_size
      options[:all] ? 100 : 1000
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require_relative 'api_runner'
2
+
3
module Unipept::Commands
  # Runner of the pept2prot command.
  class Pept2prot < ApiRunner
    # Batches are halved when the all option is set.
    def batch_size
      options[:all] ? 5 : 10
    end
  end
end
@@ -1,5 +1,12 @@
1
1
  require_relative 'api_runner'
2
2
  module Unipept::Commands
3
3
  class Pept2taxa < ApiRunner
4
+ def batch_size
5
+ if options[:all]
6
+ 5
7
+ else
8
+ 10
9
+ end
10
+ end
4
11
  end
5
12
  end
@@ -0,0 +1,18 @@
1
+ require_relative 'api_runner'
2
module Unipept::Commands
  # Runner of the taxa2lca command. It uses a SimpleBatchIterator, so the
  # complete input is handed over as a single batch.
  class Taxa2lca < ApiRunner
    def batch_iterator
      SimpleBatchIterator.new
    end

    # Not used by this command; raises when called.
    def batch_size
      raise 'NOT NEEDED FOR TAXA2LCA'
    end
  end

  # Iterator that hands over the complete input as one batch with index 0.
  class SimpleBatchIterator
    def iterate(input, &block)
      everything = input.to_a
      block.call(everything, 0)
    end
  end
end
@@ -1,5 +1,8 @@
1
1
  require_relative 'api_runner'
2
2
  module Unipept::Commands
3
3
  class Taxonomy < ApiRunner
4
+ def batch_size
5
+ 100
6
+ end
4
7
  end
5
8
  end