unipept 0.7.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +26 -0
  3. data/.travis.yml +7 -0
  4. data/Gemfile +8 -10
  5. data/Gemfile.lock +35 -21
  6. data/README.md +6 -4
  7. data/Rakefile +11 -10
  8. data/VERSION +1 -1
  9. data/bin/peptfilter +2 -44
  10. data/bin/prot2pept +4 -49
  11. data/bin/unipept +2 -197
  12. data/bin/uniprot +4 -53
  13. data/lib/batch_iterator.rb +73 -0
  14. data/lib/batch_order.rb +20 -0
  15. data/lib/commands/peptfilter.rb +118 -0
  16. data/lib/commands/prot2pept.rb +61 -0
  17. data/lib/commands/unipept/api_runner.rb +199 -0
  18. data/lib/commands/unipept/config.rb +29 -0
  19. data/lib/commands/unipept/pept2lca.rb +12 -0
  20. data/lib/commands/unipept/pept2prot.rb +13 -0
  21. data/lib/{unipept/commands → commands/unipept}/pept2taxa.rb +7 -0
  22. data/lib/commands/unipept/taxa2lca.rb +18 -0
  23. data/lib/{unipept/commands → commands/unipept}/taxonomy.rb +3 -0
  24. data/lib/commands/unipept.rb +226 -0
  25. data/lib/commands/uniprot.rb +69 -0
  26. data/lib/commands.rb +10 -0
  27. data/lib/configuration.rb +45 -0
  28. data/lib/formatters.rb +252 -0
  29. data/lib/version.rb +3 -0
  30. data/test/commands/test_peptfilter.rb +170 -0
  31. data/test/commands/test_prot2pept.rb +82 -0
  32. data/test/commands/test_unipept.rb +37 -0
  33. data/test/commands/test_uniprot.rb +136 -0
  34. data/test/commands/unipept/test_api_runner.rb +486 -0
  35. data/test/commands/unipept/test_config.rb +64 -0
  36. data/test/commands/unipept/test_pept2lca.rb +40 -0
  37. data/test/commands/unipept/test_pept2prot.rb +39 -0
  38. data/test/commands/unipept/test_pept2taxa.rb +39 -0
  39. data/test/commands/unipept/test_taxa2lca.rb +39 -0
  40. data/test/commands/unipept/test_taxonomy.rb +37 -0
  41. data/test/helper.rb +69 -23
  42. data/test/test_bach_order.rb +57 -0
  43. data/test/test_base.rb +6 -0
  44. data/test/test_batch_iterator.rb +87 -0
  45. data/test/test_configuration.rb +43 -0
  46. data/test/test_formatters.rb +140 -0
  47. data/unipept.gemspec +55 -33
  48. metadata +62 -40
  49. data/lib/unipept/batch_order.rb +0 -28
  50. data/lib/unipept/commands/api_runner.rb +0 -239
  51. data/lib/unipept/commands/pept2lca.rb +0 -6
  52. data/lib/unipept/commands/pept2prot.rb +0 -20
  53. data/lib/unipept/commands/taxa2lca.rb +0 -12
  54. data/lib/unipept/commands.rb +0 -7
  55. data/lib/unipept/configuration.rb +0 -29
  56. data/lib/unipept/formatters.rb +0 -135
  57. data/lib/unipept/version.rb +0 -3
  58. data/lib/unipept.rb +0 -8
  59. data/test/test_unipept.rb +0 -7
@@ -0,0 +1,226 @@
1
+ require 'typhoeus'
2
+
3
+ require_relative '../formatters'
4
+ require_relative '../configuration'
5
+ require_relative '../batch_order'
6
+ require_relative '../batch_iterator'
7
+ require_relative '../version'
8
+
9
+ require_relative 'unipept/config'
10
+ require_relative 'unipept/pept2lca'
11
+ require_relative 'unipept/pept2prot'
12
+ require_relative 'unipept/pept2taxa'
13
+ require_relative 'unipept/taxa2lca'
14
+ require_relative 'unipept/taxonomy'
15
+
16
module Unipept
  # Command line interface of the `unipept` executable.
  #
  # Builds a Cri root command and registers the config, pept2taxa, pept2lca,
  # taxa2lca, pept2prot and taxonomy subcommands on it. Each subcommand only
  # declares its help text and options here; the actual work is delegated to
  # the runner class named in its definition.
  class Commands::Unipept
    # Creates the root command and attaches every subcommand to it.
    def initialize
      @root_command = create_root_command
      add_config_command
      add_pept2taxa_command
      add_pept2lca_command
      add_taxa2lca_command
      add_pept2prot_command
      add_taxonomy_command
    end

    # Runs the root command (and thereby the selected subcommand).
    #
    # @param [Array<String>] args An array of command-line arguments
    def run(args)
      @root_command.run(args)
    end

    # Defines the `unipept` root command with the options shared by all
    # subcommands.
    #
    # @return [Cri::Command] The root command
    def create_root_command
      Cri::Command.new_basic_root.modify do
        name 'unipept'
        summary 'Command line interface to Unipept web services.'
        usage 'unipept subcommand [options]'
        description <<-EOS
        The unipept subcommands are command line wrappers around the Unipept web services.

        Subcommands that start with pept expect a list of tryptic peptides as input. Subcommands that start with tax expect a list of NCBI Taxonomy Identifiers as input. Input is passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way the input is passed, in the order as listed above. Text files and standard input should have one tryptic peptide or one NCBI Taxonomy Identifier per line.
        EOS
        flag :v, :version, 'displays the version'
        flag :q, :quiet, 'disable service messages'
        option :i, :input, 'read input from file', argument: :required
        option :o, :output, 'write output to file', argument: :required
        option :f, :format, "define the output format (available: #{Unipept::Formatter.available.join(', ')}) (default: #{Unipept::Formatter.default})", argument: :required

        # Configuration options
        option nil, 'host', 'specify the server running the Unipept web service', argument: :required

        # Without a subcommand only --version is meaningful; anything else
        # prints the help text and exits with a failure status.
        run do |opts, _args, cmd|
          if opts[:version]
            puts Unipept::VERSION
          else
            abort cmd.help
          end
        end
      end
    end

    # Registers the `config` subcommand, handled by Commands::Config.
    def add_config_command
      @root_command.define_command('config') do
        summary 'Set configuration options.'
        usage 'config option [value]'
        description <<-EOS
        Sets or shows the value for configuration options. All settings are stored in the .unipeptrc file in the home directory of the user.

        Running the command with a value will set that value for the given option, running it without will show the current value.

        These options are currently supported:

        - host: Set the default host for api calls.

        Example: "unipept config host http://api.unipept.ugent.be" will set the default host to the public unipept server.
        EOS

        runner Commands::Config
      end
    end

    # Registers the `pept2taxa` subcommand, handled by Commands::Pept2taxa.
    def add_pept2taxa_command
      @root_command.define_command('pept2taxa') do
        usage 'pept2taxa [options]'
        summary 'Fetch taxa of Uniprot records that match tryptic peptides.'
        description <<-EOS
        For each tryptic peptide the unipept pept2taxa command retrieves from Unipept the set of taxa from all Uniprot records whose protein sequence contains an exact match to the tryptic peptide. The command expects a list of tryptic peptides that are passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way tryptic peptides are passed, in the order as listed above. Text files and standard input should have one tryptic peptide per line.

        The unipept pept2taxa subcommand yields NCBI Taxonomy records as output.
        EOS

        flag :e, :equate, 'equate isoleucine (I) and leucine (L) when matching peptides'
        flag :a, :all, 'report all information fields of NCBI Taxonomy records available in Unipept. Note that this may have a performance penalty.'
        option :s, :select, 'select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used.', argument: :required, multiple: true

        runner Commands::Pept2taxa
      end
    end

    # Registers the `pept2lca` subcommand, handled by Commands::Pept2lca.
    def add_pept2lca_command
      @root_command.define_command('pept2lca') do
        usage 'pept2lca [options]'
        summary 'Fetch taxonomic lowest common ancestor of Uniprot records that match tryptic peptides.'
        description <<-EOS
        For each tryptic peptide the unipept pept2lca command retrieves from Unipept the lowest common ancestor of the set of taxa from all Uniprot records whose protein sequence contains an exact match to the tryptic peptide. The lowest common ancestor is based on the topology of the Unipept Taxonomy -- a cleaned up version of the NCBI Taxonomy -- and is itself a record from the NCBI Taxonomy. The command expects a list of tryptic peptides that are passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way tryptic peptides are passed, in the order as listed above. Text files and standard input should have one tryptic peptide per line.

        The unipept pept2lca subcommand yields an NCBI Taxonomy record as output.
        EOS

        flag :e, :equate, 'equate isoleucine (I) and leucine (L) when matching peptides'
        flag :a, :all, 'report all information fields of NCBI Taxonomy records available in Unipept. Note that this may have a performance penalty.'
        option :s, :select, 'select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used.', argument: :required, multiple: true

        runner Commands::Pept2lca
      end
    end

    # Registers the `taxa2lca` subcommand, handled by Commands::Taxa2lca.
    def add_taxa2lca_command
      @root_command.define_command('taxa2lca') do
        usage 'taxa2lca [options]'
        summary 'Compute taxonomic lowest common ancestor for given list of taxa.'
        description <<-EOS
        The unipept taxa2lca command computes the lowest common ancestor of a given list of NCBI Taxonomy Identifiers. The lowest common ancestor is based on the topology of the Unipept Taxonomy -- a cleaned up version of the NCBI Taxonomy -- and is itself a record from the NCBI Taxonomy. The command expects a list of NCBI Taxonomy Identifiers that are passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way NCBI Taxonomy Identifiers are passed, in the order as listed above. Text files and standard input should have one NCBI Taxonomy Identifier per line.

        The unipept taxa2lca subcommand yields an NCBI Taxonomy record as output.
        EOS

        flag :a, :all, 'report all information fields of NCBI Taxonomy records available in Unipept. Note that this may have a performance penalty.'
        option :s, :select, 'select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used.', argument: :required, multiple: true

        runner Commands::Taxa2lca
      end
    end

    # Registers the `pept2prot` subcommand, handled by Commands::Pept2prot.
    def add_pept2prot_command
      @root_command.define_command('pept2prot') do
        usage 'pept2prot [options]'
        summary 'Fetch Uniprot records that match tryptic peptides.'
        description <<-EOS
        For each tryptic peptide the unipept pept2prot command retrieves from Unipept all Uniprot records whose protein sequence contains an exact match to the tryptic peptide. The command expects a list of tryptic peptides that are passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way tryptic peptides are passed, in the order as listed above. Text files and standard input should have one tryptic peptide per line.

        The unipept pept2prot subcommand yields Uniprot records as output.
        EOS

        flag :e, :equate, 'equate isoleucine (I) and leucine (L) when matching peptides'
        flag :a, :all, 'report all information fields of Uniprot records available in Unipept. Note that this may have a performance penalty.'
        option :s, :select, 'select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used.', argument: :required, multiple: true

        runner Commands::Pept2prot
      end
    end

    # Registers the `taxonomy` subcommand, handled by Commands::Taxonomy.
    def add_taxonomy_command
      @root_command.define_command('taxonomy') do
        usage 'taxonomy [options]'
        summary 'Fetch taxonomic information from Unipept Taxonomy.'
        description <<-EOS
        The unipept taxonomy command yields information from the Unipept Taxonomy records for a given list of NCBI Taxonomy Identifiers. The Unipept Taxonomy is a cleaned up version of the NCBI Taxonomy, and its records are also records of the NCBI Taxonomy. The command expects a list of NCBI Taxonomy Identifiers that are passed

        - as separate command line arguments

        - in a text file that is passed as an argument to the -i option

        - to standard input

        The command will give priority to the first way NCBI Taxonomy Identifiers are passed, in the order as listed above. Text files and standard input should have one NCBI Taxonomy Identifier per line.

        The unipept taxonomy subcommand yields NCBI Taxonomy records as output.
        EOS

        flag :a, :all, 'report all information fields of NCBI Taxonomy records available in Unipept. Note that this may have a performance penalty.'
        option :s, :select, 'select the information fields to return. Selected fields are passed as a comma separated list of field names. Multiple -s (or --select) options may be used.', argument: :required, multiple: true

        runner Commands::Taxonomy
      end
    end

    # Invokes the unipept command-line tool with the given arguments.
    #
    # @param [Array<String>] args An array of command-line arguments
    #
    # @return [void]
    def self.run(args)
      new.run(args)
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'typhoeus'
2
+
3
module Unipept::Commands
  # Command line wrapper around the Uniprot web services: downloads the
  # record of every given accession number and prints it in the requested
  # format.
  class Uniprot
    # NOTE(review): these are instance-level readers, while the command below
    # is stored in a class-level instance variable and the formats in a local
    # variable, so on an instance both return nil. Kept for compatibility.
    attr_reader :root_command
    attr_reader :valid_formats

    # A plain frozen array keeps the listing order without depending on Set,
    # which this file never requires.
    valid_formats = %w(fasta txt xml rdf gff sequence).freeze
    @root_command = Cri::Command.define do
      name 'uniprot'
      summary 'Command line interface to Uniprot web services.'
      usage 'uniprot [options]'
      description <<-EOS
      The uniprot command is a command line wrapper around the Uniprot web services. The command expects a list of Uniprot Accession Numbers that are passed

      - as separate command line arguments

      - to standard input

      The command will give priority to the first way Uniprot Accession Numbers are passed, in the order as listed above. The standard input should have one Uniprot Accession Number per line.

      The uniprot command yields just the protein sequences as a default, but can return several formats.
      EOS
      required :f, :format, "specify output format (available: #{valid_formats.join(', ')}) (default: sequence)"
      flag :h, :help, 'show help for this command' do |_value, cmd|
        puts cmd.help
        exit 0
      end
      run do |opts, args, _cmd|
        format = opts.fetch(:format, 'sequence')
        unless valid_formats.include? format
          $stderr.puts "#{format} is not a valid output format. Available formats are: #{valid_formats.join(', ')}"
          exit 1
        end
        # command line arguments take priority over standard input
        iterator = args.empty? ? $stdin.each_line : args
        iterator.each do |accession|
          puts Uniprot.get_uniprot_entry(accession.chomp, format)
        end
      end
    end

    # Invokes the uniprot command-line tool with the given arguments.
    #
    # @param [Array<String>] args An array of command-line arguments
    #
    # @return [void]
    def self.run(args)
      @root_command.run(args)
    end

    # Fetches a Uniprot record from the uniprot website with the given accession
    # number in the requested format.
    #
    # @param [String] accession The accession number of the record to fetch
    #
    # @param [String] format The format of the record. If the format is
    # 'sequence', the sequence is returned as a single line
    #
    # @return [String, nil] The requested Uniprot record in the requested
    # format, or nil if the request was not successful
    def self.get_uniprot_entry(accession, format)
      if format == 'sequence'
        fasta = get_uniprot_entry(accession, 'fasta')
        # a failed download yields nil; pass that on instead of raising
        # NoMethodError on nil
        return nil unless fasta

        # skip the fasta header line and glue the sequence lines together;
        # drop(1) also copes with an empty response body
        fasta.lines.map(&:chomp).drop(1).join
      else
        # other format has been specified, just download and output
        resp = Typhoeus.get("http://www.uniprot.org/uniprot/#{accession}.#{format}")
        resp.response_body if resp.success?
      end
    end
  end
end
data/lib/commands.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'cri'
2
+
3
module Unipept
  # Namespace for the command line tools of the gem.
  module Commands
    # The command files are required from inside the module body, i.e. after
    # Unipept::Commands has been created. The command files reopen that
    # constant directly (e.g. `module Unipept::Commands`), which presumably is
    # why they are loaded here rather than at the top of the file.
    require_relative 'commands/peptfilter'
    require_relative 'commands/prot2pept'
    require_relative 'commands/uniprot'
    require_relative 'commands/unipept'
  end
end
@@ -0,0 +1,45 @@
1
+ require 'yaml'
2
+
3
module Unipept
  # Key/value settings that are stored in a YAML file on disk.
  class Configuration
    # @return [Hash] The settings that are currently loaded
    attr_reader :config
    # @return [String] The path of the YAML file backing this configuration
    attr_reader :file_name

    # Creates a new config object, based on a given YAML file. If no filename
    # is given, '.unipeptrc' in the home dir of the user will be used.
    #
    # If the file doesn't exist or contains no YAML document, an empty config
    # will be loaded.
    #
    # @param [String] file An optional file name of the YAML file to create the
    # config from
    def initialize(file = nil)
      @file_name = file || File.join(Dir.home, '.unipeptrc')
      # YAML.load_file returns a falsy value for an empty file; fall back to
      # an empty hash so [], []= and delete keep working
      @config = File.exist?(file_name) ? (YAML.load_file(file_name) || {}) : {}
    end

    # Saves the config to disk. If the file doesn't exist yet, a new one will be
    # created
    def save
      File.open(file_name, 'w') { |f| f.write config.to_yaml }
    end

    # Deletes a key
    #
    # @param [Object] key The key to remove from the config
    #
    # @return [Object, nil] The value that was stored under the key, if any
    def delete(key)
      config.delete(key)
    end

    # forwards [] to the internal config hash
    def [](*args)
      config[*args]
    end

    # forwards []= to the internal config hash
    def []=(*args)
      config.[]=(*args)
    end
  end
end
data/lib/formatters.rb ADDED
@@ -0,0 +1,252 @@
1
+ require 'json'
2
+
3
module Unipept
  # Base class of the output formatters. Subclasses call `register` to make
  # themselves available under a format name.
  class Formatter
    # The Hash of available formatters
    #
    # @return [Hash] A hash of the available formatters, keyed by format name
    def self.formatters
      # a class variable on purpose: subclasses register themselves through
      # the inherited `register`, so the hash must be shared by all of them
      @@formatters ||= {}
    end

    # Returns a new formatter of the given format. If the given format is not available, the
    # default formatter is returned
    #
    # @param [String] format The type of the formatter we want
    #
    # @return [Formatter] The requested formatter
    def self.new_for_format(format)
      # explicit fallback instead of rescuing the NoMethodError of nil.new, so
      # an error raised by a formatter itself is no longer silently swallowed
      (formatters[format] || formatters[default]).new
    end

    # Adds a new formatter to the list of available formats
    #
    # @param [Symbol] format The type of the format we want to register
    def self.register(format)
      formatters[format.to_s] = self
    end

    # Returns a list of the available formatters
    #
    # @return [Array<String>] The list of available formatters
    def self.available
      formatters.keys
    end

    # @return [String] The type of the default formatter: csv
    def self.default
      'csv'
    end

    # @return [String] The type of the current formatter
    def type
      ''
    end

    # Returns the header row for the given sample_data and fasta_mapper. This
    # row is output only once at the beginning of the output
    #
    # @param [Object] _sample_data The data that we will output after this
    # header. Can be used to extract the keys.
    #
    # @param [Array<Array<String>>] _fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    #
    # @return [String] The header row
    def header(_sample_data, _fasta_mapper = nil)
      ''
    end

    # Converts the given input data and corresponding fasta headers to another
    # format.
    #
    # @param [Array] data The data we wish to convert
    #
    # @param [Array<Array<String>>] _fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    #
    # @return [String] The converted input data
    def format(data, _fasta_mapper = nil)
      data
    end
  end

  # Formatter that outputs the data as JSON.
  class JSONFormatter < Formatter
    require 'json'
    register :json

    # @return [String] The type of the current formatter: json
    def type
      'json'
    end

    # Converts the given input data and corresponding fasta headers to JSON.
    # Currently ignores the fasta_mapper.
    #
    # @param [Array] data The data we wish to convert
    #
    # @param [Array<Array<String>>] _fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    #
    # @return [String] The input data converted to the JSON format.
    def format(data, _fasta_mapper = nil)
      # TODO: add fasta header based on fasta_mapper information
      data.to_json
    end
  end

  # Formatter that outputs the data as CSV, one row per element.
  class CSVFormatter < Formatter
    require 'csv'
    register :csv

    # @return [String] The type of the current formatter: csv
    def type
      'csv'
    end

    # Returns the header row for the given data and fasta_mapper. This row
    # contains all the keys of the first element of the data, preceded by
    # 'fasta_header' if a fasta_mapper is given. For empty data the header is
    # an empty string.
    #
    # @param [Array] data The data that we will use to extract the keys from.
    #
    # @param [Array<Array<String>>] fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data. If a fasta_mapper is given, the output will be
    # preceded with 'fasta_header'.
    #
    # @return [String] The header row
    def header(data, fasta_mapper = nil)
      CSV.generate do |csv|
        first = data.first
        keys = fasta_mapper ? ['fasta_header'] : []
        csv << (keys + first.keys).map(&:to_s) if first
      end
    end

    # Converts the given input data and corresponding fasta headers to the csv
    # format
    #
    # @param [Array] data The data we wish to convert
    #
    # @param [Array<Array<String>>] fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    #
    # @return [String] The converted input data into the csv format
    def format(data, fasta_mapper = nil)
      CSV.generate do |csv|
        if fasta_mapper
          format_fasta(csv, data, fasta_mapper)
        else
          format_normal(csv, data)
        end
      end
    end

    # Writes one csv row per element of the data. Empty strings are written
    # as empty (nil) fields.
    #
    # @param [CSV] csv object we write the csv output to
    #
    # @param [Array] data The data we wish to convert
    def format_normal(csv, data)
      data.each do |o|
        csv << o.values.map { |v| v == '' ? nil : v }
      end
    end

    # Writes the data as csv rows in the order of the fasta_mapper, each row
    # preceded by its fasta header. Mapper entries without matching data are
    # skipped.
    #
    # @param [CSV] csv object we write the csv output to
    #
    # @param [Array] data The data we wish to convert
    #
    # @param [Array<Array<String>>] fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    def format_fasta(csv, data, fasta_mapper)
      data_dict = group_by_first_key(data)
      fasta_mapper.each do |fasta_header, key|
        next if data_dict[key].nil?

        data_dict[key].each do |r|
          csv << ([fasta_header] + r.values).map { |v| v == '' ? nil : v }
        end
      end
    end

    # Groups the data by the (stringified) value of the first key of each
    # element, for example
    # [{key1: v1, key2: v2},{key1: v1, key2: v3},{key1: v4, key2: v2}]
    # to {v1 => [{key1: v1, key2: v2},{key1: v1, key2: v3}], v4 => [{key1: v4, key2: v2}]}
    #
    # @param [Array<Hash>] data The data we wish to group
    #
    # @return [Hash] The input data grouped by the first key
    def group_by_first_key(data)
      data.group_by { |el| el.values.first.to_s }
    end
  end

  # Formatter that outputs the data as XML.
  class XMLFormatter < Formatter
    # Monkey patch (do as to_xml, but saner)
    # NOTE(review): these patches are global and values are interpolated
    # as-is, without any XML escaping.

    class ::Object
      def to_xml(name = nil)
        name ? %(<#{name}>#{self}</#{name}>) : to_s
      end
    end

    class ::Array
      # The second argument is accepted but not used: every element is
      # wrapped in an <item> tag.
      def to_xml(array_name = :array, _item_name = :item)
        %(<#{array_name} size="#{size}">) + map { |n| n.to_xml(:item) }.join + "</#{array_name}>"
      end
    end

    class ::Hash
      def to_xml(name = nil)
        data = to_a.map { |k, v| v.to_xml(k) }.join
        name ? "<#{name}>#{data}</#{name}>" : data
      end
    end

    register :xml

    # @return [String] The type of the current formatter: xml
    def type
      'xml'
    end

    # Converts the given input data and corresponding fasta headers to XML.
    # Currently ignores the fasta_mapper.
    #
    # @param [Array] data The data we wish to convert
    #
    # @param [Array<Array<String>>] _fasta_mapper Optional mapping between input
    # data and corresponding fasta header. The data is represented as a list
    # containing tuples where the first element is the fasta header and second
    # element is the input data
    #
    # @return [String] The input data converted to the XML format.
    def format(data, _fasta_mapper = nil)
      # TODO: add fasta header based on fasta_mapper information
      data.to_xml
    end
  end
end
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
1
module Unipept
  # The version of the gem, taken from the VERSION file that sits one
  # directory above the directory of this file.
  version_file = File.expand_path('../VERSION', File.dirname(__FILE__))
  VERSION = File.read(version_file).chomp
end