dataverse 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
# Define the `spec` task and make it the default Rake task.
RSpec::Core::RakeTask.new(:spec)

task default: %i[spec]
data/bin/console ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'dotenv/load'
5
+
6
+ require "bundler/setup"
7
+ require "dataverse"
8
+
9
+ # You can add fixtures and/or initialization code here to make experimenting
10
+ # with your gem easier. You can also use a different console, if you like.
11
+
12
+ require 'awesome_print'
13
# Start IRB when IRB_CONSOLE is set in the environment, Pry otherwise.
console = ENV['IRB_CONSOLE'] ? 'irb' : 'pry'
require console
console == 'irb' ? IRB.start : Pry.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
# Abort on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail
IFS=$'\n\t'
# Echo every line as it is read and every command as it is executed.
set -o verbose -o xtrace

bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/dataverse.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/dataverse/version"
4
+
5
Gem::Specification.new do |spec|
  spec.name = "dataverse"
  spec.version = Dataverse::VERSION
  spec.authors = ["Kris Dekeyser"]
  spec.email = ["kris.dekeyser@libis.be"]

  spec.summary = "Dataverse API."
  spec.description = "Dataverse.org API wrapper."
  spec.homepage = "https://rubygems.org/gems/dataverse"
  spec.license = "MIT"
  # The library uses Hash#transform_keys and `rescue` inside do/end blocks,
  # both of which need Ruby 2.5.
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/libis/dataverse_api"
  # NOTE(review): assumes the default branch is `master` - confirm.
  spec.metadata["changelog_uri"] = "https://github.com/libis/dataverse_api/blob/master/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "rest-client", "~> 2.0"
  # lib/dataverse/base.rb requires 'rexml/document'; rexml is a bundled gem
  # (no longer a default gem) since Ruby 3.0, so it has to be declared.
  spec.add_dependency "rexml", "~> 3.2"
end
data/lib/dataverse.rb ADDED
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "dataverse/version"
4
+ require_relative "dataverse/errors"
5
+ require_relative "dataverse/base"
6
+ require_relative "dataverse/dataverse"
7
+ require_relative "dataverse/dataset"
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rest-client'
4
+ require 'json'
5
+ require 'rexml/document'
6
+
7
+ require 'forwardable'
8
+
9
+ module Dataverse
10
# Shared foundation of the API wrappers: keeps the frozen data returned by the
# API, delegates Hash-style readers to it and performs the HTTP calls.
class Base
  extend Forwardable

  # Data stored (and frozen) by #init.
  attr_reader :api_data

  def_delegators :@api_data, :[], :fetch, :keys, :dig

  # Re-runs #get_data and stores its result again.
  def refresh
    init(get_data)
  end

  # Two wrappers are equal when their API data is equal. Objects that do not
  # expose #api_data compare as unequal instead of raising NoMethodError.
  def ==(other)
    other.respond_to?(:api_data) && api_data == other.api_data
  end

  def eql?(other)
    self == other
  end

  def hash
    api_data.hash
  end

  # Performs a request against the server configured through the API_URL and
  # API_TOKEN environment variables.
  #
  # @param url [String] path relative to API_URL (a leading '/' is ignored)
  # @param method [Symbol] HTTP verb handed to RestClient
  # @param headers [Hash] extra request headers (not modified)
  # @param params [Hash] query parameters
  # @param body [Hash, REXML::Document, Object] payload; a Hash is serialised
  #   for JSON requests, a REXML::Document for XML requests
  # @param format [Symbol] :api, :json, :xml, :raw, :response or :status;
  #   any other value returns the response body
  # @param block [Proc, nil] forces the :block format and is passed to
  #   RestClient as :block_response
  # @param options [Hash] extra RestClient::Request options (not modified)
  # @return [Object] depends on +format+; for :api the 'data' member of the reply
  # @raise [Error] when the environment is not configured or the API reports
  #   an error
  def self.api_call(url, method: :get, headers: {}, params: {}, body: nil, format: :api, block: nil, options: {})
    unless ENV.key?('API_URL') && ENV.key?('API_TOKEN')
      raise Error.new("Set environment variables 'API_URL' and 'API_TOKEN'")
    end

    url = ENV['API_URL'].chomp('/') + '/' + url.sub(/^\//, '')

    # Work on copies so the hashes supplied by the caller stay untouched.
    headers = headers.dup
    options = options.dup

    headers['X-Dataverse-key'] = ENV['API_TOKEN']
    headers[:params] = params unless params.empty?

    format = :block if block

    case format
    when :xml
      headers[:accept] = :xml
      headers[:content_type] ||= :xml
    when :api, :json
      headers[:accept] = :json
      headers[:content_type] ||= :json
    when :raw
      options[:raw_response] = true
    when :block
      options[:block_response] = block
    end

    body = body.to_json if body.is_a?(Hash) && headers[:content_type] == :json
    # Document#write prints to $stdout by default; #to_s yields the XML text.
    body = body.to_s if body.is_a?(REXML::Document) && headers[:content_type] == :xml

    response = RestClient::Request.execute(
      method: method,
      url: url,
      headers: headers,
      payload: body,
      **options
    )

    case format
    when :api
      data = JSON.parse(response.body)
      raise Error.new(data['message']) unless data['status'] == 'OK'
      data['data']
    when :xml
      REXML::Document.new(response.body)
    when :json
      JSON.parse(response.body)
    when :raw, :block, :response
      response
    when :status
      response.code
    else
      response.body
    end
  rescue RestClient::Exception => e
    # Turn an API error reply into an Error carrying the API message, with the
    # backtrace trimmed to start at the first matching lib/dataverse frame.
    if e.http_body =~ /^\s*{\s*"status"\s*:\s*"ERROR"\s*,\s*"message"\s*:\s*"/
      regex = /lib\/dataverse\/(?!.*:in\s*`.*(api_)?call'$)/
      raise Error.new(JSON.parse(e.http_body)['message'],
                      backtrace: e.backtrace.drop_while { |x| !regex.match?(x) })
    end
    raise
  end

  protected

  # Stores and freezes the given data.
  def init(data)
    @api_data = data
    @api_data.freeze
  end

  # Returns the data #refresh stores; here simply the current data.
  def get_data
    @api_data
  end

  # Instance-level shortcut to Base.api_call.
  def api_call(url, **args)
    self.class.api_call(url, **args)
  end
end
118
+ end
119
+
120
+ # if log = ENV['RESTCLIENT_LOG']
121
+ # RestClient.log = STDOUT if log.upcase == 'STDOUT'
122
+ # RestClient.log = STDERR if log.upcase == 'STDERR'
123
+ # RestClient.log = log
124
+ # end
@@ -0,0 +1,376 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base'
4
+
5
+ module Dataverse
6
+ class Dataset < Base
7
+
8
+ attr_reader :id
9
+
10
# Builds the wrapper for the dataset with the given id.
def self.id(id)
  Dataset.new(id)
end
13
+
14
# Looks the dataset up by persistent identifier and wraps the id found.
def self.pid(pid)
  found = api_call('datasets/:persistentId', params: { 'persistentId' => pid })
  Dataset.new(found['id'])
end
18
+
19
# Creates a dataset in the given dataverse.
#
# @param data [Hash, String] dataset description, forwarded to .parse
# @param dataverse [Object] target dataverse, forwarded to .parse
def self.create(data:, dataverse:)
  # The helper that posts the data is .parse; `new_dataset` is not defined.
  parse(dataverse, data)
end
22
+
23
# Imports a dataset with an existing persistent identifier.
#
# @param data [Hash, String] dataset description, forwarded to .parse
# @param dataverse [Object] target dataverse, forwarded to .parse
# @param pid [String] persistent identifier, sent as the `pid` parameter
# @param publish [Boolean] sent as release=yes/no
# @param ddi [Boolean] when true the data is posted as XML
def self.import(data:, dataverse:, pid:, publish: false, ddi: false)
  # The helper that posts the data is .parse; `new_dataset` is not defined.
  parse(dataverse, data, import: pid, publish: publish, ddi: ddi)
end
26
+
27
# Deletes the draft version and drops it from the local caches.
#
# @return [Object] the 'message' member of the API reply
# @raise [Error] when there is no draft version
def delete
  raise Error.new('Can only delete draft version') unless draft_version

  versions # memoises @version_numbers so the draft entry can be removed below
  result = call('versions/:draft', method: :delete)
  [@version_data, @metadata, @files].each { |cache| cache.delete(:draft) }
  @version_numbers&.delete(:draft)
  # Without any published version nothing is left: reset to an empty state.
  init({}) if published_versions.empty?
  result['message']
end
38
+
39
# Submits the dataset for review.
def submit
  # The HTTP verb must be the symbol :post; a bare `post` is an undefined name.
  call('submitForReview', method: :post)
end
42
+
43
# Returns the dataset to its author.
#
# @param reason [Object] sent as the request body
def reject(reason)
  # The HTTP verb must be the symbol :post; a bare `post` is an undefined name.
  # NOTE(review): the Dataverse API appears to expect a JSON object with a
  # "reasonForReturn" member as body - confirm what callers should pass here.
  call('returnToAuthor', method: :post, body: reason)
end
46
+
47
# Publishes the dataset as a major or minor release.
#
# @return [String, nil] a status text for HTTP 200 and 202, nil otherwise
def publish(major: true)
  release_type = major ? 'major' : 'minor'
  status = call('actions/:publish', method: :post, params: { type: release_type }, format: :status)
  case status
  when 200 then "Dataset #{pid} published"
  when 202 then "Dataset #{pid} waiting for review"
  end
end
54
+
55
# Performs an API call relative to this dataset ("datasets/<id>/<url>").
def call(url, **args)
  dataset_path = "datasets/#{id}/#{url}"
  api_call(dataset_path, **args)
end
58
+
59
# Persistent identifier recorded in the data of the given version.
def pid(version: :latest)
  details = version_data(version)
  details.fetch('datasetPersistentId')
end
62
+
63
# Storage size of the dataset, parsed from the API message.
#
# @return [Integer] first run of digits (thousands separators removed) found
#   in the 'message' member of the reply
def size
  # Arguments are separated by a comma, and the query parameter is spelled
  # `includeCached`.
  data = call('storagesize', params: { includeCached: 'true' })
  data['message'][/[,\d]+/].delete(',').to_i
end
67
+
68
# Memoised list of usable version keys: :latest, :published (only when a
# published version exists), :draft (when present) and the published numbers.
def versions
  @version_numbers ||= begin
    labels = %i[latest published] + [draft_version].compact + published_versions
    labels.delete(:published) if published_versions.empty?
    labels
  end
end
75
+
76
# Returns :draft when draft data is loaded, nil otherwise.
def draft_version
  :draft if @version_data.key?(:draft)
end
79
+
80
# Memoised numbers ("major.minor" converted to Float) of all versions the API
# reports in state RELEASED.
def published_versions
  @published_versions ||=
    call('versions')
    .select { |entry| entry['versionState'] == 'RELEASED' }
    .map { |entry| "#{entry['versionNumber']}.#{entry['versionMinorNumber']}".to_f }
end
86
+
87
# Resolves a version reference to its key; unlike #resolve_version's default
# this asks for nil instead of an exception when the version is unknown.
def version(version = :latest)
  resolve_version(version, raise_if_not_found: false)
end
90
+
91
# Value of the 'title' metadata field of the given version.
def title(version: :latest)
  fields = metadata(version: version)
  fields.fetch('title')
end
94
+
95
# 'authorName' of the first entry in the 'author' metadata field.
def author(version: :latest)
  authors = metadata(version: version).fetch('author')
  authors.first.fetch('authorName')
end
98
+
99
# 'lastUpdateTime' of the given version, parsed and converted to local time.
def updated(version: :latest)
  stamp = version_data(version).fetch('lastUpdateTime')
  Time.parse(stamp).getlocal
end
102
+
103
# 'createTime' of the given version, parsed and converted to local time.
def created(version: :latest)
  stamp = version_data(version).fetch('createTime')
  Time.parse(stamp).getlocal
end
106
+
107
# 'releaseTime' of the given version as local time, or nil when the version
# data has no such member.
def published(version: :published)
  details = version_data(version)
  return nil unless details.key?('releaseTime')

  Time.parse(details.fetch('releaseTime')).getlocal
end
111
+
112
# Names of the metadata fields of the given version ([] when there are none).
def metadata_fields(version: :latest)
  names = metadata(version: version)&.keys
  names || []
end
115
+
116
# Export formats accepted by #export_metadata, grouped by the format the reply
# is parsed as. 'rdm' and 'raw' are assembled locally by #rdm_data / #raw_data.
# Frozen so the shared lists cannot be modified by accident.
MD_TYPES_XML = %w[ddi oai_ddi dcterms oai_dc Datacite oai_datacite].freeze
MD_TYPES_JSON = %w[schema.org OAI_ORE dataverse_json].freeze
MD_TYPES = (%w[rdm raw] + MD_TYPES_JSON + MD_TYPES_XML).freeze
119
+
120
# Exports the metadata in the requested format.
#
# @param md_type [String, Symbol] one of MD_TYPES
# @return [Object, nil] nil when no published version resolves; the result of
#   #rdm_data / #raw_data for 'rdm' / 'raw'; otherwise the API export reply
# @raise [Error] for an unknown format
def export_metadata(md_type)
  return nil unless version(:published)

  exporter = md_type.to_s
  return rdm_data if exporter == 'rdm'
  return raw_data if exporter == 'raw'

  format =
    if MD_TYPES_XML.include?(exporter)
      :xml
    elsif MD_TYPES_JSON.include?(exporter)
      :json
    else
      raise Error.new("Unknown metadata format: '#{md_type}'")
    end
  api_call('datasets/export', params: { exporter: md_type, persistentId: pid }, format: format)
end
136
+
137
# Combines the dataset data, the version data, the packed metadata and the
# file list of a version into one Hash; nil when the version does not resolve.
def rdm_data(version: :published)
  # The keyword `version` shadows #version; the parenthesised call below still
  # dispatches to the method.
  return nil unless version(version)

  combined = api_data.merge(version_data(version))
  combined = combined.merge('metadata' => metadata(version: version))
  combined.merge('files' => files(version: version))
end
144
+
145
# Rebuilds a 'datasetVersion' structure with the unprocessed metadata blocks
# (and optionally the files) fetched from the API.
#
# @param version [Object] version reference
# @param with_files [Boolean] also fetch the 'files' member
# @return [Hash] { 'datasetVersion' => Hash }
def raw_data(version: :latest, with_files: false)
  # Hash#merge already returns a new Hash, so no dup is needed.
  result = api_data.merge(version_data(resolve_version(version)))
  # No leading '/': #call joins with "datasets/<id>/" and would otherwise
  # produce a double slash in the URL.
  version_path = "versions/#{version_string(version)}"
  result['metadataBlocks'] = call("#{version_path}/metadata")
  result['files'] = call("#{version_path}/files") if with_files
  { 'datasetVersion' => result }
end
151
+
152
# Packed metadata of the given version ({} when nothing is stored for it).
def metadata(version: :latest)
  key = resolve_version(version)
  @metadata[key] || {}
end
155
+
156
# File list of the given version ([] when nothing is stored for it).
def files(version: :latest)
  key = resolve_version(version)
  @files[key] || []
end
159
+
160
# Download size of a version, parsed from the API message: the first run of
# digits and commas, with the commas removed.
def download_size(version: :latest)
  reply = call("versions/#{version_string(version)}/downloadsize")
  digits = reply['message'][/[,\d]+/]
  digits.delete(',').to_i
end
164
+
165
# Streams the dataset files (a zip archive by default name) into a local file.
#
# @param filename [String] path of the file to write
# @param version [Object, nil] version reference; nil leaves the version out
#   of the URL
# @return [Integer, false] number of bytes written, or false when the
#   response check raises Net::HTTPServerException
# @raise [Error] when the version does not resolve to a version string
def download(filename = 'dataverse_files.zip', version: nil)
  if version
    v = version_string(version)
    raise Error.new("Version '#{version}' does not exist") unless v
    version = v
  end

  # Binary mode: the payload is an archive, not text.
  File.open(filename, 'wb') do |f|
    size = 0
    # A proc (not a lambda) on purpose: `return false` leaves #download itself.
    block = proc do |response|
      begin
        response.value
        response.read_body do |chunk|
          size += chunk.bytesize
          f.write chunk
        end
      rescue Net::HTTPServerException
        return false
      end
    end
    url = 'access/dataset/:persistentId'
    url += "/versions/#{version}" if version
    api_call(url, params: { persistentId: pid }, block: block)
    # File.open closes the file when the block ends.
    size
  end
end
190
+
191
+ protected
192
+
193
# Remembers the dataset id and loads the dataset data.
def initialize(id)
  @id = id
  fetched = get_data
  init(fetched)
end
197
+
198
# Resets all per-version caches, then processes and stores the dataset data.
def init(data)
  # The caches must be empty before process_data runs.
  @version_data, @metadata, @files = {}, {}, {}
  @version_numbers = @published_versions = nil
  super(process_data(data))
end
206
+
207
# Fetches the dataset data ("datasets/<id>") from the API.
def get_data
  dataset_path = "datasets/#{id}"
  api_call(dataset_path)
end
210
+
211
# Translates a version reference into the key used by the version caches and
# makes sure the data of that version is loaded.
#
# @param version [Symbol, String, Numeric] :draft, :latest, :published, their
#   string spellings, or a version number
# @param raise_if_not_found [Boolean] raise instead of returning nil
# @return [Symbol, Float, nil] the version key, nil when unknown
# @raise [VersionError] for an unknown version when raise_if_not_found is set
def resolve_version(version, raise_if_not_found: true)
  requested = version

  # Normalise the textual aliases; other strings and numbers become Floats.
  version =
    case version
    when ':draft', 'draft' then :draft
    when ':latest', 'latest' then :latest
    when ':published', 'published', ':latest-published', 'latest-published' then :published
    when Numeric, String then version.to_f
    else version
    end

  # Replace the :latest / :published aliases by a concrete key.
  case version
  when :latest then version = draft_version || published_versions.max
  when :published then version = published_versions.max
  end

  return version if @version_data.key?(version)

  # Not loaded yet: accept it only when it is a known version, then fetch it.
  version = versions.find { |known| known == version }
  raise VersionError.new(requested) if version.nil? && raise_if_not_found
  return nil unless version

  process_version_data(call("versions/#{version}"))
  version
end
244
+
245
# Spelling of a version for use in API URLs: symbols get a leading colon
# (":draft"), numbers are converted to strings, anything else is returned as is.
def version_string(version)
  resolved = resolve_version(version)
  return ":#{resolved}" if resolved.is_a?(Symbol)
  return resolved.to_s if resolved.is_a?(Numeric)

  resolved
end
256
+
257
# Data of the given version with the 'id' key renamed to 'versionId'.
#
# @param version [Object] version reference, resolved with #resolve_version
# @return [Hash] a new Hash; the cached data is left untouched
def version_data(version)
  @version_data[resolve_version(version)].transform_keys { |key| key == 'id' ? 'versionId' : key }
end
260
+
261
+ private
262
+
263
# Splits the 'latestVersion' member off the dataset data, hands it to
# #process_version_data and returns what is left ({} for nil or empty input).
def process_data(data)
  return {} if data.nil? || data.empty?

  process_version_data(data.delete('latestVersion'))
  data
end
269
+
270
# Packs the metadata blocks and files of one version, stores everything under
# the version's key and returns that key.
def process_version_data(data)
  packed_metadata = pack_metadata(data.delete('metadataBlocks'))
  packed_files = pack_files(data.delete('files'))
  get_version_number(data).tap do |number|
    store_data(number, data, packed_metadata, packed_files)
  end
end
277
+
278
# Derives the cache key of a version from its data.
#
# @param data [Hash] version data with 'versionState' and, for released
#   versions, 'versionNumber' / 'versionMinorNumber'
# @return [Symbol, Float] :draft, or "major.minor" converted to Float
# @raise [Error] for any other version state
def get_version_number(data)
  state = data['versionState']
  case state
  when 'DRAFT'
    :draft
  when 'RELEASED'
    "#{data['versionNumber']}.#{data['versionMinorNumber']}".to_f
  else
    # The closing quote was missing from this message.
    raise Error.new("Unsupported version state: '#{state}'")
  end
end
288
+
289
# Freezes the three parts of a version and files them under the version key.
def store_data(version, data, metadata, files)
  data.freeze
  metadata.freeze
  files.freeze
  @version_data[version] = data
  @metadata[version] = metadata
  @files[version] = files
end
294
+
295
# Flattens the metadata blocks into one Hash of field name => field value.
def pack_metadata(metadata)
  metadata.each_value.with_object({}) do |block, packed|
    block['fields'].each { |field| packed[field['typeName']] = field_to_value(field) }
  end
end
304
+
305
# Merges the 'dataFile' details of every file entry into the entry itself.
def pack_files(files)
  files.map do |entry|
    data_file = entry.delete('dataFile')
    entry.merge(data_file)
  end
end
311
+
312
# Extracts the value of a metadata field according to its 'typeClass'.
#
# @raise [Error] for a typeClass other than primitive, controlledVocabulary
#   or compound
def field_to_value(field)
  type_class = field['typeClass']
  case type_class
  when 'primitive', 'controlledVocabulary' then field['value']
  when 'compound' then compound_to_value(field['value'])
  else raise Error.new("Unsupported typeClass: '#{type_class}'")
  end
end
324
+
325
# Converts a compound value (or an Array of them) into a Hash of
# sub-field name => sub-field value.
def compound_to_value(data)
  return data.map { |item| compound_to_value(item) } if data.is_a?(Array)

  data.values.map { |field| [field['typeName'], field_to_value(field)] }.to_h
end
333
+
334
# Posts dataset data to a dataverse and returns the wrapper of the new dataset.
#
# @param dataverse [Dataverse, Object] a Dataverse (its id is used) or the
#   value to put in the URL
# @param data [Hash, String] a Hash, the name of an existing file, or a JSON
#   string (any string when +ddi+ is set)
# @param import [String, nil] persistent id; switches to the ':import' URL
# @param publish [Boolean] sent as release=yes/no
# @param ddi [Boolean] post the data with an XML content type
# @return [Dataset]
# @raise [Error] when the data is none of the accepted forms
def self.parse(dataverse, data, import: nil, publish: false, ddi: false)
  dataverse = dataverse.id if dataverse.is_a?(Dataverse)

  data = StringIO.new(data.to_json) if data.is_a?(Hash)

  if data.is_a?(String)
    begin
      if File.exist?(data)
        data = File.open(data, 'r')
      elsif ddi || JSON.parse(data)
        data = StringIO.new(data)
      end
    rescue JSON::ParserError, SystemCallError
      # SystemCallError covers the Errno::* failures of File.open; `File`
      # itself is not an exception class and could never match here.
      data = nil
    end
  end

  unless data.is_a?(File) || data.is_a?(StringIO)
    raise Error.new("Data could not be parsed. Should be a Hash, filename or JSON string.")
  end

  url = "dataverses/#{dataverse}/datasets"
  url += '/:import' if import

  params = { release: publish ? 'yes' : 'no' }
  params[:pid] = import if import

  headers = { content_type: ddi ? :xml : :json }

  # The reply is no longer printed to stdout (leftover debug output).
  result = api_call(url, method: :post, headers: headers, body: data, params: params)
  Dataset.id(result['id'])
ensure
  data.close if data.is_a?(File)
end
374
+
375
+ end
376
+ end