dataverse 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
# frozen_string_literal: true

# Standard gem tasks (build, install, release, ...).
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# `rake spec` runs the RSpec test suite.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments executes the test suite.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Load environment variables (API_URL, API_TOKEN, ...) from a .env file.
require "dotenv/load"

require "bundler/setup"
require "dataverse"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

require "awesome_print"

# Pry is the default console; set IRB_CONSOLE to use IRB instead.
if ENV["IRB_CONSOLE"]
  require "irb"
  IRB.start
else
  require "pry"
  Pry.start
end
data/bin/setup ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Fail fast: abort on errors, unset variables and pipeline failures.
set -euo pipefail
IFS=$'\n\t'
# Echo each command as it runs, for easier debugging of setup problems.
set -vx

bundle install

# Do any other automated setup that you need to do here
data/dataverse.gemspec ADDED
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require_relative "lib/dataverse/version"

Gem::Specification.new do |spec|
  spec.name          = "dataverse"
  spec.version       = Dataverse::VERSION
  spec.authors       = ["Kris Dekeyser"]
  spec.email         = ["kris.dekeyser@libis.be"]

  spec.summary       = "Dataverse API."
  spec.description   = "Dataverse.org API wrapper."
  spec.homepage      = "https://rubygems.org/gems/dataverse"
  spec.license       = "MIT"
  # BUGFIX: the library uses Hash#transform_keys (Ruby 2.5+) and block-level
  # `rescue` inside a proc (Ruby 2.6+), so the previous ">= 2.4.0" constraint
  # allowed installs on Rubies that cannot run the code.
  spec.required_ruby_version = Gem::Requirement.new(">= 2.6.0")

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/libis/dataverse_api"
  # BUGFIX: the changelog URI was missing the "/blob/master" path segment and
  # pointed to a non-existing GitHub page.
  spec.metadata["changelog_uri"] = "https://github.com/libis/dataverse_api/blob/master/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependency: all HTTP traffic goes through rest-client.
  spec.add_dependency "rest-client", "~> 2.0"
end
data/lib/dataverse.rb ADDED
@@ -0,0 +1,7 @@
# frozen_string_literal: true

# Entry point of the Dataverse gem: load all components in dependency order
# (version and errors first, then the Base plumbing, then the API wrappers).
require_relative "dataverse/version"
require_relative "dataverse/errors"
require_relative "dataverse/base"
require_relative "dataverse/dataverse"
require_relative "dataverse/dataset"
@@ -0,0 +1,124 @@
# frozen_string_literal: true

require 'rest-client'
require 'json'
require 'rexml/document'

require 'forwardable'

module Dataverse
  # Common plumbing for Dataverse API resources (datasets, dataverses, ...).
  #
  # A resource keeps the raw API payload in #api_data (a frozen Hash) and
  # delegates common Hash readers to it. Subclasses override #get_data to
  # fetch their payload and call #init to store it.
  class Base
    extend Forwardable

    # The raw, frozen data Hash as returned by the Dataverse API.
    attr_reader :api_data

    def_delegators :@api_data, :[], :fetch, :keys, :dig

    # Re-fetch the resource data from the API and replace the stored payload.
    def refresh
      init(get_data)
    end

    protected

    # Store the given payload and freeze it against accidental mutation.
    def init(data)
      @api_data = data
      @api_data.freeze
    end

    # Default data source; subclasses override this to call the API.
    def get_data
      @api_data
    end

    public

    # Two resources are considered equal when their API payloads are equal.
    def ==(other)
      self.api_data == other.api_data
    end

    def eql?(other)
      self == other
    end

    def hash
      api_data.hash
    end

    protected

    # Instance-level convenience wrapper around the class-level API call.
    def api_call(url, **args)
      self.class.api_call(url, **args)
    end

    # Perform a request against the Dataverse API.
    #
    # url     - path relative to the API root (ENV['API_URL'])
    # method  - HTTP verb (default :get)
    # headers - extra request headers; the API token is always added
    # params  - query string parameters
    # body    - request payload; a Hash is JSON-encoded and a REXML::Document
    #           is serialized when the content type matches
    # format  - how to interpret the response:
    #           :api    - parse the JSON envelope, raise Error unless its
    #                     status is 'OK', return its 'data' part (default)
    #           :json   - parse and return the raw JSON body
    #           :xml    - return a REXML::Document
    #           :raw    - return the raw response object
    #           :status - return the HTTP status code
    #           other   - return the response body string
    # block   - streaming response handler (forces :block handling)
    # options - extra options forwarded to RestClient::Request.execute
    #
    # Raises Error when the API credentials are missing or the API reports an
    # error.
    def self.api_call(url, method: :get, headers: {}, params: {}, body: nil, format: :api, block: nil, options: {})

      unless ENV.has_key?('API_URL') && ENV.has_key?('API_TOKEN')
        raise Error.new("Set environment variables 'API_URL' and 'API_TOKEN'")
      end

      # Join the API root and the path with exactly one '/'.
      url = ENV['API_URL'].chomp('/') + '/' + url.sub(/^\//, '')

      headers['X-Dataverse-key'] = ENV['API_TOKEN']
      headers[:params] = params unless params.empty?

      format = :block if block

      case format
      when :xml
        headers[:accept] = :xml
        headers[:content_type] ||= :xml
      when :api, :json
        headers[:accept] = :json
        headers[:content_type] ||= :json
      when :raw
        options[:raw_response] = true
      when :block
        options[:block_response] = block
      end

      body = body.to_json if body.is_a?(Hash) && headers[:content_type] == :json
      # BUGFIX: REXML::Document#write with no arguments prints the document to
      # $stdout and returns the output object, so the payload ended up being
      # an IO instead of the XML text; #to_s yields the serialized XML.
      body = body.to_s if body.is_a?(REXML::Document) && headers[:content_type] == :xml

      response = RestClient::Request.execute(
        method: method,
        url: url,
        headers: headers,
        payload: body,
        # log: STDOUT,
        **options
      )

      case format
      when :api
        data = JSON.parse(response.body)
        raise Error.new(data['message']) unless data['status'] == 'OK'
        return data['data']
      when :xml
        REXML::Document.new(response.body)
      when :json
        return JSON.parse(response.body)
      when :raw, :block, :response
        return response
      when :status
        return response.code
      else
        return response.body
      end

    rescue RestClient::Exception => e
      # Convert an API error payload into a Dataverse::Error, trimming the
      # backtrace to the first frame inside this gem that is not one of the
      # call helpers themselves.
      if e.http_body =~ /^\s*{\s*"status"\s*:\s*"ERROR"\s*,\s*"message"\s*:\s*"/
        regex = /lib\/dataverse\/(?!.*:in\s*`.*(api_)?call'$)/
        raise Error.new(JSON.parse(e.http_body)['message'],
                        backtrace: e.backtrace.drop_while { |x| !regex.match?(x) }
        )
      end
      raise
    end

  end
end

# if log = ENV['RESTCLIENT_LOG']
#   RestClient.log = STDOUT if log.upcase == 'STDOUT'
#   RestClient.log = STDERR if log.upcase == 'STDERR'
#   RestClient.log = log
# end
@@ -0,0 +1,376 @@
# frozen_string_literal: true

require 'time'     # Time.parse — used by #updated, #created and #published
require 'stringio' # StringIO payloads in .new_dataset

require_relative 'base'

module Dataverse
  # Wrapper around the Dataverse 'datasets' API endpoints.
  class Dataset < Base

    # Internal (database) id of the dataset.
    attr_reader :id

    # Instantiate a dataset from its internal id.
    def self.id(id)
      Dataset.new(id)
    end

    # Instantiate a dataset from its persistent identifier (DOI or Handle).
    def self.pid(pid)
      data = api_call('datasets/:persistentId', params: {'persistentId' => pid})
      Dataset.new(data['id'])
    end

    # Create a new dataset in the given dataverse.
    def self.create(data:, dataverse:)
      new_dataset(dataverse, data)
    end

    # Import a dataset with a known persistent id into the given dataverse.
    def self.import(data:, dataverse:, pid:, publish: false, ddi: false)
      new_dataset(dataverse, data, import: pid, publish: publish, ddi: ddi)
    end

    # Delete the draft version. Raises Error when no draft version exists.
    # Returns the API result message.
    def delete
      # BUGFIX: was `url = raise ... unless draft_version`, assigning the
      # never-produced result of raise to an unused local.
      raise Error.new 'Can only delete draft version' unless draft_version
      versions
      result = call('versions/:draft', method: :delete)
      @version_data.delete(:draft)
      @metadata.delete(:draft)
      @files.delete(:draft)
      @version_numbers&.delete(:draft)
      init({}) if published_versions.empty?
      result['message']
    end

    # Submit the draft version for review.
    def submit
      # BUGFIX: was `method: post` (a call to an undefined method) instead of
      # the :post symbol.
      call('submitForReview', method: :post)
    end

    # Return a submitted draft to the author with the given reason.
    def reject(reason)
      # BUGFIX: was `method: post` instead of :post.
      call('returnToAuthor', method: :post, body: reason)
    end

    # Publish the dataset; a major release bumps the major version number,
    # otherwise the minor number is bumped.
    def publish(major: true)
      result = call('actions/:publish', method: :post,
                    params: {type: major ? 'major' : 'minor'}, format: :status
      )
      return "Dataset #{pid} published" if result == 200
      return "Dataset #{pid} waiting for review" if result == 202
    end

    # Perform an API call relative to this dataset's endpoint.
    def call(url, **args)
      api_call("datasets/#{id}/#{url}", **args)
    end

    # Persistent identifier of the given version.
    def pid(version: :latest)
      version_data(version).fetch('datasetPersistentId')
    end

    # Total storage size of the dataset in bytes (cached files included).
    def size
      # BUGFIX: was `call("storagesize". params: ...)` — a period instead of a
      # comma (syntax error) — and misspelled the 'includeCached' parameter.
      data = call('storagesize', params: {includeCached: 'true'})
      data['message'][/[,\d]+/].delete(',').to_i
    end

    # All known version selectors: :latest, :published (when any), :draft
    # (when present) and the published version numbers.
    def versions
      @version_numbers ||= begin
        data = [:latest, :published] + [draft_version].compact + published_versions
        data.delete(:published) unless published_versions.size > 0
        data
      end
    end

    # :draft when a draft version is cached, nil otherwise.
    def draft_version
      return :draft if @version_data.keys.include?(:draft)
    end

    # Version numbers (as Floats) of all released versions.
    def published_versions
      @published_versions ||= call('versions').map do |x|
        next unless x['versionState'] == 'RELEASED'
        "#{x['versionNumber']}.#{x['versionMinorNumber']}".to_f
      end.compact
    end

    # Resolve a version selector, returning nil when it does not exist.
    def version(version = :latest)
      resolve_version(version, raise_if_not_found: false)
    end

    # Dataset title of the given version.
    def title(version: :latest)
      metadata(version: version).fetch('title')
    end

    # Name of the first author of the given version.
    def author(version: :latest)
      metadata(version: version).fetch('author').first.fetch('authorName')
    end

    # Last update time, converted to the local time zone.
    def updated(version: :latest)
      Time.parse(version_data(version).fetch('lastUpdateTime')).getlocal
    end

    # Creation time, converted to the local time zone.
    def created(version: :latest)
      Time.parse(version_data(version).fetch('createTime')).getlocal
    end

    # Release time (local time zone), or nil when the version has none.
    def published(version: :published)
      return nil unless version_data(version).has_key?('releaseTime')
      Time.parse(version_data(version).fetch('releaseTime')).getlocal
    end

    # Names of the metadata fields present in the given version.
    def metadata_fields(version: :latest)
      metadata(version: version)&.keys || []
    end

    # Export formats served as XML / JSON by the API. Frozen: the magic
    # comment only freezes string literals, not these arrays.
    MD_TYPES_XML = ['ddi', 'oai_ddi', 'dcterms', 'oai_dc', 'Datacite', 'oai_datacite'].freeze
    MD_TYPES_JSON = ['schema.org', 'OAI_ORE', 'dataverse_json'].freeze
    MD_TYPES = (['rdm', 'raw'] + MD_TYPES_JSON + MD_TYPES_XML).freeze

    # Export the published dataset metadata in the requested format.
    # Returns nil when the dataset has no published version; raises Error for
    # an unknown format.
    def export_metadata(md_type)
      return nil unless version(:published)
      format = case md_type.to_s
               when *MD_TYPES_XML
                 :xml
               when *MD_TYPES_JSON
                 :json
               when 'rdm'
                 return rdm_data
               when 'raw'
                 return raw_data
               else
                 raise Error.new("Unknown metadata format: '#{md_type}'")
               end
      api_call('datasets/export', params: {exporter: md_type, persistentId: pid}, format: format)
    end

    # Flattened dataset + version + metadata + files Hash.
    def rdm_data(version: :published)
      return nil unless version(version)
      api_data
        .merge(version_data(version))
        .merge('metadata' => metadata(version: version))
        .merge('files' => files(version: version))
    end

    # Raw version data as returned by the API, wrapped for re-import.
    def raw_data(version: :latest, with_files: false)
      result = api_data.dup.merge(version_data(resolve_version(version)))
      # BUGFIX: dropped the leading '/' — #call already joins with '/', so the
      # old paths produced double-slash URLs (datasets/<id>//versions/...).
      result['metadataBlocks'] = call("versions/#{version_string(version)}/metadata")
      result['files'] = call("versions/#{version_string(version)}/files") if with_files
      { 'datasetVersion' => result }
    end

    # Packed metadata Hash of the given version ({} when not found).
    def metadata(version: :latest)
      @metadata[resolve_version(version)] || {}
    end

    # File list of the given version ([] when not found).
    def files(version: :latest)
      @files[resolve_version(version)] || []
    end

    # Download size in bytes of the given version.
    def download_size(version: :latest)
      data = call("versions/#{version_string(version)}/downloadsize")
      data['message'][/[,\d]+/].delete(',').to_i
    end

    # Download the dataset as a ZIP archive into filename.
    # Returns the number of bytes written, or false on an HTTP client error.
    def download(filename = 'dataverse_files.zip', version: nil)
      if version
        v = version_string(version)
        raise Error.new("Version '#{version}' does not exist") unless v
        version = v
      end
      # BUGFIX: open in binary mode ('wb'); the payload is a ZIP archive and
      # text mode would corrupt it on platforms with newline translation.
      File.open(filename, 'wb') do |f|
        size = 0
        block = proc do |response|
          response.value
          response.read_body do |chunk|
            size += chunk.size
            f.write chunk
          end
        rescue Net::HTTPServerException
          # proc-level return exits #download itself — deliberate here.
          return false
        end
        url = 'access/dataset/:persistentId'
        url += "/versions/#{version}" if version
        params = {persistentId: pid}
        api_call(url, params: params, block: block)
        f.close
        size
      end
    end

    protected

    # Datasets are created via .id / .pid / .create / .import.
    def initialize(id)
      @id = id
      init(get_data)
    end

    # Reset all cached version data, then store the new payload.
    def init(data)
      @version_data = {}
      @metadata = {}
      @files = {}
      @version_numbers = nil
      @published_versions = nil
      super(process_data(data))
    end

    # Fetch the dataset payload from the API.
    def get_data
      api_call("datasets/#{id}")
    end

    # Normalize a version selector (:draft / :latest / :published / a number)
    # to a key of @version_data, loading the version from the API on demand.
    # Raises VersionError for an unknown version unless raise_if_not_found is
    # false, in which case nil is returned.
    def resolve_version(version, raise_if_not_found: true)
      _version = version

      version = case version
                when ':draft', 'draft'
                  :draft
                when ':latest', 'latest'
                  :latest
                when ':published', 'published', ':latest-published', 'latest-published'
                  :published
                when Numeric, String
                  version.to_f
                else
                  version
                end

      case version
      when :latest
        version = draft_version || published_versions.max
      when :published
        version = published_versions.max
      end

      unless @version_data.keys.include?(version)
        version = versions.find { |x| x == version }
        raise VersionError.new(_version) if version.nil? && raise_if_not_found
        return nil unless version
        data = call("versions/#{version}")
        process_version_data(data)
      end

      version
    end

    # String form of a version selector as used in API URLs.
    def version_string(version)
      v = resolve_version(version)
      case v
      when Symbol
        ":#{v}"
      when Numeric
        v.to_s
      else
        v
      end
    end

    # Version payload with the 'id' key renamed to 'versionId'.
    def version_data(version)
      @version_data[resolve_version(version)].transform_keys { |k| k == 'id' ? 'versionId' : k }
    end

    private

    # Split the dataset payload into dataset data and latest version data.
    def process_data(data)
      return {} if data.nil? || data.empty?
      version_data = data.delete('latestVersion')
      process_version_data(version_data)
      data
    end

    # Split a version payload into version data, metadata and files, cache
    # them under the version's key, and return that key.
    def process_version_data(data)
      metadata = pack_metadata(data.delete('metadataBlocks'))
      files = pack_files(data.delete('files'))
      version = get_version_number(data)
      store_data(version, data, metadata, files)
      version
    end

    # Version key for a version payload: :draft or a Float version number.
    def get_version_number(data)
      case data['versionState']
      when 'DRAFT'
        :draft
      when 'RELEASED'
        "#{data['versionNumber']}.#{data['versionMinorNumber']}".to_f
      else
        # BUGFIX: the error message was missing its closing quote.
        raise Error.new("Unsupported version state: '#{data['versionState']}'")
      end
    end

    # Cache the (frozen) version data, metadata and files.
    def store_data(version, data, metadata, files)
      @version_data[version] = data.freeze
      @metadata[version] = metadata.freeze
      @files[version] = files.freeze
    end

    # Flatten the metadataBlocks structure into a field name => value Hash.
    def pack_metadata(metadata)
      data = {}
      metadata.each_value do |block|
        block['fields'].each do |field|
          data[field['typeName']] = field_to_value(field)
        end
      end
      data
    end

    # Merge each file's 'dataFile' detail Hash into the file entry itself.
    def pack_files(files)
      files.map do |file|
        detail = file.delete('dataFile')
        file.merge(detail)
      end
    end

    # Convert a metadata field to its plain value.
    def field_to_value(field)
      case field['typeClass']
      when 'primitive'
        return field['value']
      when 'controlledVocabulary'
        return field['value']
      when 'compound'
        compound_to_value(field['value'])
      else
        raise Error.new("Unsupported typeClass: '#{field['typeClass']}'")
      end
    end

    # Recursively convert a compound field value to a plain Hash/Array.
    def compound_to_value(data)
      return data.map { |x| compound_to_value(x) } if data.is_a?(Array)
      hash = {}
      data.values.each do |v|
        hash[v['typeName']] = field_to_value(v)
      end
      hash
    end

    # Shared implementation behind .create and .import: upload a dataset
    # description given as a Hash, JSON string, DDI/XML string or filename.
    # BUGFIX: this method was named `parse`, but .create and .import call
    # `new_dataset` (matching this exact signature); renamed to match. Also
    # removed a leftover debug `puts result` and the invalid
    # `rescue JSON::ParserError, File` clause — File is not an exception
    # class and would raise a TypeError when the rescue was evaluated.
    def self.new_dataset(dataverse, data, import: nil, publish: false, ddi: false)

      dataverse = dataverse.id if dataverse.is_a?(Dataverse)

      data = StringIO.new(data.to_json) if data.is_a?(Hash)

      if data.is_a?(String)
        begin
          if File.exist?(data)
            data = File.open(data, 'r')
          elsif ddi || JSON::parse(data)
            # Raw JSON (or DDI/XML) text: wrap it for streaming upload.
            data = StringIO.new(data)
          end
        rescue JSON::ParserError
          data = nil
        end
      end

      unless data.is_a?(File) || data.is_a?(StringIO)
        raise Error.new("Data could not be parsed. Should be a Hash, filename or JSON string.")
      end

      url = "dataverses/#{dataverse}/datasets"
      url += '/:import' if import

      params = {release: publish ? 'yes' : 'no'}
      params[:pid] = import if import

      headers = {content_type: :json}
      headers[:content_type] = :xml if ddi

      result = api_call(url, method: :post, headers: headers, body: data, params: params)

      return Dataset.id(result['id'])

    ensure
      data.close if data.is_a?(File)

    end

  end
end