toccatore 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +50 -0
  3. data/.travis.yml +19 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +3 -0
  6. data/Gemfile.lock +110 -0
  7. data/LICENSE.md +21 -0
  8. data/README.md +28 -0
  9. data/bin/toccatore +5 -0
  10. data/lib/toccatore.rb +1 -0
  11. data/lib/toccatore/base.rb +361 -0
  12. data/lib/toccatore/cli.rb +38 -0
  13. data/lib/toccatore/orcid_update.rb +56 -0
  14. data/lib/toccatore/version.rb +3 -0
  15. data/spec/cli_spec.rb +29 -0
  16. data/spec/fixtures/orcid_update.json +1015 -0
  17. data/spec/fixtures/orcid_update_nil.json +12 -0
  18. data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_fail.yml +149 -0
  19. data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_succeed.yml +3453 -0
  20. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
  21. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +149 -0
  22. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_no_works.yml +38 -0
  23. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_works.yml +38 -0
  24. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +3307 -0
  25. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
  26. data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +236 -0
  27. data/spec/orcid_update_spec.rb +120 -0
  28. data/spec/spec_helper.rb +91 -0
  29. data/toccatore.gemspec +37 -0
  30. metadata +314 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 13870f2c02d23ff37a9f4de7af84bfa353af388d
4
+ data.tar.gz: a356318891ee353f9dab772c9f74b1629a21b6dc
5
+ SHA512:
6
+ metadata.gz: 63cd648459e66ce0bda3434ec311e35b12fd66d617882cb59c78a1c9091a6448c9ff2f43c0bd187d1b27c8e0cfabfd861f11325d73de8245be8419f3f79a6086
7
+ data.tar.gz: c0a6a5c484535dace3a5a677e73c97858f3f27bcab2383917d9d7ea631242abd869ed85b6d5eca275a87ab55ac072938b2a6c96093e50cdc13d3b259fea2573c
@@ -0,0 +1,50 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ # Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
@@ -0,0 +1,19 @@
1
+ language: ruby
2
+ cache: bundler
3
+ sudo: false
4
+ rvm:
5
+ - 2.3.1
6
+ script:
7
+ - bundle exec rspec
8
+ - bundle exec codeclimate-test-reporter
9
+ notifications:
10
+ slack: datacite:Wt8En0ALoTA6Kjc5EOKNDWxN
11
+ email: false
12
+ deploy:
13
+ provider: rubygems
14
+ api_key:
15
+ secure: eLWrKUf5as7RNOJpm4/viPR97yOpYjj7yiwblXf1axtBr/nJCi/ZZ3UyakHbsM88DI+sF52A56cum+0KkWTQuzBVR6TXB3u/UclC0z1pyjv6QClm3qLi5/lx6f//7K9FmdB130CuAUbJgUBiIyDElPJK4bE+teBUWft/Pb49Yy1/5M5F0VV/lZrOQ/O6js9cdmxxmp8DfC+UMmw1I982VGJ1xTW6vhWlZ3pA+PLi7KkdxzA5f3/SQLIC8ij6i9FLFXz37qs5ynumzDKiyshKoVZ7mVeR0SjmGAteAXDqkwmknJPMJTHxc2dvxDpZjB1KguBw6Ohs/Bv+R14bzyXepkBaZ8Mo++Ro0EqRdP9tdLbhhtJJ4+MrVPqYVL+JakAcJY3Y58e/j4ZOvbjrWFJ3oMljDpxzSUNvMvpWCQ8NlEDAhOEG3b4jbERl+vEhjYlcDVeSxBrxA02wXCoWTrZxpkRZY8qdgA3O21W+pcixEGIYT/Ox0jTfWdQUhqjJM16qcN13i6SMzeC1FaihXkA1AltUtEIgXA+uJA9aMrhHGYktMc2XkUqO4blna6ExzBwvafzZgor47oCOh1VLxpas+5Hui3YKEnmHn8sxKa26WuQJomnXgXfhB8n+eB/KWunWOPiBvJZDMU2C6AvP6N1MDTKvQWp2bwU5Jamt9vNjTtM=
16
+ gem: toccatore
17
+ on:
18
+ tags: true
19
+ repo: datacite/toccatore
@@ -0,0 +1,5 @@
1
+ ## v.0.1 (January 30, 2017)
2
+
3
+ [toccatore 0.1](https://github.com/datacite/toccatore/releases/tag/v.0.1) was released on January 30, 2017:
4
+
5
+ * initial release
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,110 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ toccatore (0.1)
5
+ activesupport (~> 4.2, >= 4.2.5)
6
+ dotenv (~> 2.1, >= 2.1.1)
7
+ gender_detector (~> 1.0)
8
+ maremma (~> 3.1)
9
+ namae (~> 0.11.0)
10
+ thor (~> 0.19)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ activesupport (4.2.7.1)
16
+ i18n (~> 0.7)
17
+ json (~> 1.7, >= 1.7.7)
18
+ minitest (~> 5.1)
19
+ thread_safe (~> 0.3, >= 0.3.4)
20
+ tzinfo (~> 1.1)
21
+ addressable (2.5.0)
22
+ public_suffix (~> 2.0, >= 2.0.2)
23
+ builder (3.2.3)
24
+ codeclimate-test-reporter (1.0.5)
25
+ simplecov
26
+ crack (0.4.3)
27
+ safe_yaml (~> 1.0.0)
28
+ diff-lcs (1.3)
29
+ docile (1.1.5)
30
+ dotenv (2.2.0)
31
+ excon (0.45.4)
32
+ faraday (0.9.2)
33
+ multipart-post (>= 1.2, < 3)
34
+ faraday-encoding (0.0.4)
35
+ faraday
36
+ faraday_middleware (0.10.1)
37
+ faraday (>= 0.7.4, < 1.0)
38
+ gender_detector (1.0.0)
39
+ hashdiff (0.3.2)
40
+ i18n (0.7.0)
41
+ json (1.8.6)
42
+ maremma (3.1.2)
43
+ activesupport (~> 4.2, >= 4.2.5)
44
+ addressable (~> 2.5)
45
+ builder (~> 3.2, >= 3.2.2)
46
+ excon (~> 0.45.0)
47
+ faraday (~> 0.9.2)
48
+ faraday-encoding (~> 0.0.1)
49
+ faraday_middleware (~> 0.10.0)
50
+ multi_json (~> 1.11.2)
51
+ nokogiri (~> 1.6.7)
52
+ oj (~> 2.13.1)
53
+ mini_portile2 (2.1.0)
54
+ minitest (5.10.1)
55
+ multi_json (1.11.3)
56
+ multipart-post (2.0.0)
57
+ namae (0.11.3)
58
+ nokogiri (1.6.8.1)
59
+ mini_portile2 (~> 2.1.0)
60
+ oj (2.13.1)
61
+ public_suffix (2.0.5)
62
+ rack (2.0.1)
63
+ rack-test (0.6.3)
64
+ rack (>= 1.0)
65
+ rake (12.0.0)
66
+ rspec (3.5.0)
67
+ rspec-core (~> 3.5.0)
68
+ rspec-expectations (~> 3.5.0)
69
+ rspec-mocks (~> 3.5.0)
70
+ rspec-core (3.5.4)
71
+ rspec-support (~> 3.5.0)
72
+ rspec-expectations (3.5.0)
73
+ diff-lcs (>= 1.2.0, < 2.0)
74
+ rspec-support (~> 3.5.0)
75
+ rspec-mocks (3.5.0)
76
+ diff-lcs (>= 1.2.0, < 2.0)
77
+ rspec-support (~> 3.5.0)
78
+ rspec-support (3.5.0)
79
+ safe_yaml (1.0.4)
80
+ simplecov (0.12.0)
81
+ docile (~> 1.1.0)
82
+ json (>= 1.8, < 3)
83
+ simplecov-html (~> 0.10.0)
84
+ simplecov-html (0.10.0)
85
+ thor (0.19.4)
86
+ thread_safe (0.3.5)
87
+ tzinfo (1.2.2)
88
+ thread_safe (~> 0.1)
89
+ vcr (3.0.3)
90
+ webmock (1.24.6)
91
+ addressable (>= 2.3.6)
92
+ crack (>= 0.3.2)
93
+ hashdiff
94
+
95
+ PLATFORMS
96
+ ruby
97
+
98
+ DEPENDENCIES
99
+ bundler (~> 1.0)
100
+ codeclimate-test-reporter (~> 1.0, >= 1.0.0)
101
+ rack-test (~> 0)
102
+ rake (~> 12.0)
103
+ rspec (~> 3.4)
104
+ simplecov (~> 0.12.0)
105
+ toccatore!
106
+ vcr (~> 3.0, >= 3.0.3)
107
+ webmock (~> 1.22, >= 1.22.3)
108
+
109
+ BUNDLED WITH
110
+ 1.12.5
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 DataCite
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,28 @@
1
+ # Toccatore
2
+
3
+ [![Build Status](https://travis-ci.org/datacite/toccatore.svg?branch=master)](https://travis-ci.org/datacite/toccatore)
4
+ [![Code Climate](https://codeclimate.com/github/datacite/toccatore/badges/gpa.svg)](https://codeclimate.com/github/datacite/toccatore)
5
+ [![Test Coverage](https://codeclimate.com/github/datacite/toccatore/badges/coverage.svg)](https://codeclimate.com/github/datacite/toccatore/coverage)
6
+
7
+ Command-line client for finding ORCID IDs in DataCite metadata.
8
+
9
+ ## Development
10
+
11
+ We use rspec for unit testing:
12
+
13
+ ```
14
+ bundle exec rspec
15
+ ```
16
+
17
+ Follow along via [GitHub Issues](https://github.com/datacite/toccatore/issues).
18
+
19
+ ### Note on Patches/Pull Requests
20
+
21
+ * Fork the project
22
+ * Write tests for your new feature or a test that reproduces a bug
23
+ * Implement your feature or make a bug fix
24
+ * Do not mess with Rakefile, version or history
25
+ * Commit, push and make a pull request. Bonus points for topical branches.
26
+
27
+ ## License
28
+ **toccatore** is released under the [MIT License](https://github.com/datacite/toccatore/blob/master/LICENSE.md).
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path("../../lib/toccatore", __FILE__)
4
+
5
+ Toccatore::CLI.start
@@ -0,0 +1 @@
1
+ require "toccatore/orcid_update"
@@ -0,0 +1,361 @@
1
+ require 'namae'
2
+ require 'gender_detector'
3
+
4
+ module Toccatore
5
+ class Base
6
# Populate ENV when this class is loaded, from two optional sources.

# 1. A .env file, located via "../../../.env" relative to this file.
#    dotenv is only required when that file actually exists.
env_file = File.expand_path("../../../.env", __FILE__)
if File.exist?(env_file)
  require 'dotenv'
  Dotenv.load! env_file
end

# 2. The container environment dump, if the JSON file exists
#    (see https://github.com/phusion/baseimage-docker#envvar_dumps).
env_json_file = "/etc/container_environment.json"
if File.exist?(env_json_file)
  # JSON is not required anywhere else in this file; load it here instead
  # of depending on some other library having required it first.
  require 'json'
  env_vars = JSON.parse(File.read(env_json_file))
  env_vars.each { |k, v| ENV[k] = v }
end
20
+
21
# Builds the search URL for one page of results.
#
# Recognised options:
#   :offset     - first row to return (sent as `start`); coerced with to_i,
#                 so a missing value becomes 0
#   :rows       - page size; falls back to job_batch_size when blank
#   :from_date  - first day of the `updated` window; falls back to yesterday
#   :until_date - last day of the `updated` window; falls back to today
#
# `q` is not defined in this part of the class, so it must be provided
# elsewhere (presumably by a subclass -- confirm).
#
# Returns `url` with the form-encoded query parameters appended.
def get_query_url(options={})
  window_start = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  window_end = options[:until_date].presence || Time.now.to_date.iso8601

  updated = "updated:[#{window_start}T00:00:00Z TO #{window_end}T23:59:59Z]"
  filter = "#{updated} AND has_metadata:true AND is_active:true"

  query = {
    q: q,
    start: options[:offset].to_i,
    rows: options[:rows].presence || job_batch_size,
    fl: "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated",
    fq: filter,
    wt: "json"
  }

  url + URI.encode_www_form(query)
end
38
+
39
# Number of works matching the query described by `options`.
#
# The request asks for zero rows, so only the "numFound" count nested under
# "data" -> "response" in the response body is read. Missing keys along
# that path yield 0.
def get_total(options={})
  response = Maremma.get(get_query_url(options.merge(rows: 0)), options)
  data = response.body.fetch("data", {})
  data.fetch("response", {}).fetch("numFound", 0)
end
44
+
45
# Processes every page of results for the query described by `options`.
#
# The total comes from get_total; pages are job_batch_size rows each and
# process_data is called once per page with the matching :offset.
#
# The caller's `options` hash is left untouched: each page gets its own
# merged copy instead of having :offset written into the shared hash.
#
# Returns the total number of works found (0 when there is nothing to do).
def queue_jobs(options={})
  total = get_total(options)

  if total > 0
    # walk through paginated results
    total_pages = (total.to_f / job_batch_size).ceil

    (0...total_pages).each do |page|
      process_data(options.merge(offset: page * job_batch_size))
    end
  end

  # return number of works queued
  total
end
61
+
62
# Runs one page through the pipeline: fetch, parse, push.
#
# get_data receives the options plus :timeout and :source_id, parse_data
# receives the options plus :source_id, and push_data receives the options
# exactly as passed in.
#
# Returns whatever push_data returns.
def process_data(options = {})
  fetch_options = options.merge(timeout: timeout, source_id: source_id)
  parse_options = options.merge(source_id: source_id)

  response = get_data(fetch_options)
  parsed = parse_data(response, parse_options)

  push_data(parsed, options)
end
70
+
71
# Fetches one page of results: builds the URL with get_query_url and hands
# it, together with the same options, to Maremma.get.
#
# Returns the Maremma response unchanged.
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end
75
+
76
# Turns one search response into the list of relations to push.
#
# result  - response object exposing the parsed payload through `body`
# options - accepted but not used by this implementation
#
# Returns the payload's "errors" value when it is present; otherwise the
# result of get_relations_with_related_works for the "docs" found under
# "data" -> "response".
def parse_data(result, options={})
  body = result.body
  return body.fetch("errors") if body.fetch("errors", nil).present?

  # Read the documents from the payload as well -- the same place the error
  # check above and get_total look -- rather than from the response wrapper.
  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end
82
+
83
+ # push to Lagotto deposit API if no error and we have collected works
84
# Posts one deposit per item to push_url.
#
# items   - array of hashes as produced by the get_*_relations helpers:
#           :relation (string-keyed hash), optional :subj, :obj,
#           :message_type (defaults to "relation") and :prefix.
#           nil is treated like an empty list.
# options - accepted but not used by this implementation
#
# Returns an array with the result of each Maremma.post call, or [] when
# there is nothing to push.
def push_data(items, options={})
  # coerce first so that nil takes the same "nothing to do" path as []
  items = Array(items)
  return [] if items.empty?

  items.map do |item|
    relation = item.fetch(:relation, {})
    deposit = { "deposit" => { "subj_id" => relation.fetch("subj_id", nil),
                               "obj_id" => relation.fetch("obj_id", nil),
                               "relation_type_id" => relation.fetch("relation_type_id", nil),
                               "source_id" => relation.fetch("source_id", nil),
                               "publisher_id" => relation.fetch("publisher_id", nil),
                               "subj" => item.fetch(:subj, {}),
                               "obj" => item.fetch(:obj, {}),
                               "message_type" => item.fetch(:message_type, "relation"),
                               "prefix" => item.fetch(:prefix, nil),
                               "source_token" => uuid } }

    Maremma.post push_url, data: deposit.to_json, content_type: 'json', token: access_token
  end
end
103
+
104
# Expands search result documents into a flat list of relation hashes.
#
# For every document a `subj` description is assembled from the document
# fields and from the creators found in its Base64-encoded "xml" field.
# The relations are then collected, in this order:
#   1. DOI relations     (relatedIdentifier entries containing ":DOI:")
#   2. GitHub relations  (relatedIdentifier entries with a github URL)
#   3. contributions     (nameIdentifier entries starting with "ORCID:")
#   4. one import record for the document itself, only when source_id is
#      "datacite_import"
#
# items - array of string-keyed hashes; nil yields []
def get_relations_with_related_works(items)
  Array(items).flat_map do |item|
    doi = item.fetch("doi", nil)
    prefix = doi[/^10\.\d{4,5}/]

    # the fallback value is the Base64 form of an empty <hsh></hsh> document
    metadata = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    resource = Hash.from_xml(metadata).fetch("resource", {})
    creators = resource.fetch("creators", {}).fetch("creator", [])
    # a single creator is parsed as a Hash rather than a one-element Array
    creators = [creators] if creators.is_a?(Hash)

    subj = {
      "pid" => doi_as_url(doi),
      "DOI" => doi,
      "author" => get_hashed_authors(creators),
      "title" => item.fetch("title", []).first,
      "container-title" => item.fetch("publisher", nil),
      "published" => item.fetch("publicationYear", nil),
      "issued" => item.fetch("minted", nil),
      "publisher_id" => item.fetch("datacentre_symbol", nil),
      "registration_agency_id" => "datacite",
      "tracked" => true,
      "type" => item.fetch("resourceTypeGeneral", nil)
    }

    doi_ids = item.fetch('relatedIdentifier', []).select { |id| id =~ /:DOI:.+/ }
    relations = [] + get_doi_relations(subj, doi_ids)

    github_ids = item.fetch('relatedIdentifier', []).select { |id| id =~ /:URL:https:\/\/github.com.+/ }
    relations += get_github_relations(subj, github_ids)

    orcid_ids = item.fetch('nameIdentifier', []).select { |id| id =~ /^ORCID:.+/ }
    relations += get_contributions(subj, orcid_ids)

    if source_id == "datacite_import"
      relations << { prefix: prefix,
                     relation: { "subj_id" => subj["pid"],
                                 "source_id" => source_id,
                                 "publisher_id" => subj["publisher_id"],
                                 "occurred_at" => subj["issued"] },
                     subj: subj }
    end

    relations
  end
end
150
+
151
# Builds the relations for GitHub URLs listed as related identifiers.
#
# subj  - string-keyed hash describing the work; "DOI", "pid",
#         "publisher_id" and "issued" are read
# items - strings of the form "<RelationType>:<IdentifierType>:<url>"
#
# Each item contributes, in order:
#   * the work -> URL relation, typed with the underscored relation type
#   * an "is_part_of" relation URL -> repository, only when the cleaned URL
#     differs from the repository URL
#   * an owner -> repository contribution
#
# Returns an array of hashes ([] when items is nil or empty).
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, target = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    target = PostRank::URI.clean(target)
    github_hash = github_from_url(target)
    owner_url = github_as_owner_url(github_hash)
    repo_url = github_as_repo_url(github_hash)

    relations << {
      prefix: prefix,
      relation: {
        "subj_id" => subj["pid"],
        "obj_id" => target,
        "relation_type_id" => raw_relation_type.underscore,
        "source_id" => source_id,
        "publisher_id" => subj["publisher_id"],
        "registration_agency_id" => "github",
        "occurred_at" => subj["issued"]
      },
      subj: subj
    }

    # the identifier points below the repository (e.g. at a release)
    unless target == repo_url
      relations << {
        relation: {
          "subj_id" => target,
          "obj_id" => repo_url,
          "relation_type_id" => "is_part_of",
          "source_id" => source_id,
          "publisher_id" => "github",
          "registration_agency_id" => "github"
        }
      }
    end

    relations << {
      message_type: "contribution",
      relation: {
        "subj_id" => owner_url,
        "obj_id" => repo_url,
        "source_id" => "github_contributor",
        "registration_agency_id" => "github"
      }
    }
  end
end
191
+
192
# Builds the relations for DOIs listed as related identifiers.
#
# subj  - string-keyed hash describing the work; "DOI", "pid",
#         "publisher_id" and "issued" are read
# items - strings of the form "<RelationType>:<IdentifierType>:<doi>"
#
# The related DOI is stripped and upcased, and its registration agency is
# looked up with get_doi_ra. Items registered with "datacite" are skipped
# while source_id is "datacite_crossref". The source recorded on each
# relation is "datacite_crossref" for Crossref DOIs and "datacite_related"
# for everything else.
#
# Returns an array of hashes ([] when items is nil or empty).
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, related_identifier = item.split(':', 3)
    doi = related_identifier.strip.upcase
    registration_agency = get_doi_ra(doi)

    next if source_id == "datacite_crossref" && registration_agency == "datacite"

    relation_source = registration_agency == "crossref" ? "datacite_crossref" : "datacite_related"

    relations << {
      prefix: prefix,
      relation: {
        "subj_id" => subj["pid"],
        "obj_id" => doi_as_url(doi),
        "relation_type_id" => raw_relation_type.underscore,
        "source_id" => relation_source,
        "publisher_id" => subj["publisher_id"],
        "registration_agency_id" => registration_agency,
        "occurred_at" => subj["issued"]
      },
      subj: subj
    }
  end
end
218
+
219
+ # we are flipping subj and obj for contributions
220
# Builds one contribution per valid ORCID name identifier.
#
# The roles are flipped compared with the other relation builders: the
# ORCID URL becomes "subj_id" and the work (`obj`) becomes "obj_id".
#
# obj   - string-keyed hash describing the work; "DOI", "pid",
#         "publisher_id" and "issued" are read
# items - strings of the form "ORCID:<identifier>"
#
# Identifiers rejected by validate_orcid are skipped individually; the
# remaining identifiers are still processed.
#
# Returns an array of hashes ([] when items is nil or empty).
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, contributions|
    orcid = item.split(':', 2).last
    orcid = validate_orcid(orcid)

    # `next`, not `return`: a `return` here would leave the whole method
    # and silently drop every identifier after the first invalid one
    next if orcid.nil?

    contributions << { prefix: prefix,
                       message_type: "contribution",
                       relation: { "subj_id" => orcid_as_url(orcid),
                                   "obj_id" => obj["pid"],
                                   "relation_type_id" => nil,
                                   "source_id" => source_id,
                                   "publisher_id" => obj["publisher_id"],
                                   "registration_agency_id" => "datacite",
                                   "occurred_at" => obj["issued"] },
                       obj: obj }
  end
end
241
+
242
# Names of the configuration fields, as symbols.
def config_fields
  %i[url push_url access_token]
end
245
+
246
# Base URL of the search API. It ends in "?" because get_query_url appends
# the encoded query string directly to it.
def url
  'https://search.datacite.org/api?'
end
249
+
250
# Value passed as the :timeout option when fetching data
# (presumably seconds -- confirm against the HTTP client).
def timeout
  2 * 60
end
253
+
254
# Rows per page: the default for `rows` in the query and the step used
# when paging through results.
def job_batch_size
  1_000
end
257
+
258
+ # remove non-printing whitespace
259
# Returns a copy of `doi` with every zero-width space (U+200B) removed.
def clean_doi(doi)
  doi.delete("\u200B")
end
262
+
263
# Extracts an upcased DOI from either a doi.org / dx.doi.org URL (the URL
# path without its leading slash) or a "doi:" prefixed string.
#
# Returns nil when `url` has neither form.
def doi_from_url(url)
  resolver_url = /(http|https):\/\/(dx\.)?doi\.org\/(\w+)/

  return Addressable::URI.parse(url).path[1..-1].upcase if resolver_url.match(url)
  return url[4..-1].upcase if url.starts_with?("doi:")

  nil
end
271
+
272
# Turns a DOI into its https://doi.org/ URL: the DOI is passed through
# clean_doi and the result is encoded with Addressable.
#
# Returns nil when `doi` is blank.
def doi_as_url(doi)
  return unless doi.present?

  Addressable::URI.encode("https://doi.org/#{clean_doi(doi)}")
end
275
+
276
# Returns everything after "http://orcid.org/" when `url` starts with that
# prefix, and nil otherwise.
def orcid_from_url(url)
  found = /\Ahttp:\/\/orcid\.org\/(.+)/.match(url)
  found ? found[1] : nil
end
279
+
280
# Prefixes an ORCID identifier with "http://orcid.org/".
#
# Returns nil when `orcid` is blank.
def orcid_as_url(orcid)
  return unless orcid.present?

  "http://orcid.org/#{orcid}"
end
283
+
284
# Validates the format of an ORCID identifier.
#
# orcid - the bare identifier ("0000-0002-1825-0097") or its URL form with
#         an http:// or https:// orcid.org prefix
#
# The identifier must be four dash-separated groups of four characters:
# digits only, except that the very last character may be "X". Exactly one
# final character is accepted, so over-long values are rejected.
# The checksum is not verified.
#
# Returns the bare identifier, or nil when the format does not match
# (including nil input).
def validate_orcid(orcid)
  Array(/\A(?:https?:\/\/orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z/.match(orcid)).last
end
287
+
288
+ # parse author string into CSL format
289
+ # only assume personal name when using sort-order: "Turing, Alan"
290
# Converts one author string into a CSL-style name hash.
#
# author  - the raw name; nil is treated like an empty string
# options - accepted but not used by this implementation
#
# Returns
#   { "literal" => "" }                 for nil / blank input
#   { "literal" => <cleaned name> }     when Namae finds no name or
#                                       is_personal_name? says it is not a
#                                       personal name
#   { "family" => ..., "given" => ... } otherwise (nil parts are dropped)
def get_one_author(author, options = {})
  # to_s so that a missing name (nil) takes the blank path instead of
  # raising on nil.strip
  return { "literal" => "" } if author.to_s.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    { "family" => name.family,
      "given" => name.given }.compact
  end
end
305
+
306
# Normalises a raw author string before it is parsed.
#
# * Without a comma, trailing initials ("Smith J.") are turned into
#   sort-order form ("Smith, J."); names that already contain a comma are
#   left alone.
# * The result is capitalised with String#my_titleize.
# * Every whitespace character is replaced by a plain space.
def cleanup_author(author)
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  titleized = author.my_titleize
  titleized.gsub(/[[:space:]]/, ' ')
end
315
+
316
# Decides whether an author string looks like a personal name.
#
# A comma is taken as proof (sort-order form, "Turing, Alan") and yields
# true. Otherwise the first whitespace-separated word is looked up as a
# given name with name_detector, and that lookup's result is returned.
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end
322
+
323
+ # parse array of author strings into CSL format
324
# Converts author strings into CSL-style hashes by calling get_one_author
# on each entry. A single value or nil is accepted as well (via Array()).
def get_authors(authors, options={})
  Array(authors).map do |raw_name|
    get_one_author(raw_name, options)
  end
end
327
+
328
+ # parse array of author hashes into CSL format
329
# Converts creator hashes into CSL-style hashes by calling
# get_one_hashed_author on each entry; nil yields [].
def get_hashed_authors(authors)
  Array(authors).map do |creator|
    get_one_hashed_author(creator)
  end
end
332
+
333
# Converts one creator hash into a CSL-style hash.
#
# The "creatorName" value is parsed with get_one_author; the result is
# extended with an "ORCID" entry from get_name_identifier, and entries
# whose value is nil are dropped.
def get_one_hashed_author(author)
  csl_name = get_one_author(author.fetch("creatorName", nil))
  csl_name.merge("ORCID" => get_name_identifier(author)).compact
end
340
+
341
# Returns the ORCID URL for a creator hash, or nil.
#
# The scheme is read from "nameIdentifierScheme" (taken to be "orcid" when
# the key is absent) and compared case-insensitively. A URL is returned
# only when the scheme is "orcid" and "nameIdentifier" passes
# validate_orcid.
def get_name_identifier(author)
  scheme = author.fetch("nameIdentifierScheme", "orcid").downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  orcid ? "http://orcid.org/#{orcid}" : nil
end
350
+
351
# Given-name lookup used by is_personal_name?.
#
# The detector is created on first use and then reused for the lifetime
# of this object, instead of building a fresh GenderDetector for every
# single author lookup.
def name_detector
  @name_detector ||= GenderDetector.new
end
354
+ end
355
+ end
356
+
357
# Core extension used when cleaning up author names.
class String
  # Returns a copy in which the character following every word boundary or
  # underscore is upcased; all other characters are left as they are.
  def my_titleize
    gsub(/(\b|_)(.)/) do
      Regexp.last_match(1) + Regexp.last_match(2).upcase
    end
  end
end
361
+ end