toccatore 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.travis.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +110 -0
- data/LICENSE.md +21 -0
- data/README.md +28 -0
- data/bin/toccatore +5 -0
- data/lib/toccatore.rb +1 -0
- data/lib/toccatore/base.rb +361 -0
- data/lib/toccatore/cli.rb +38 -0
- data/lib/toccatore/orcid_update.rb +56 -0
- data/lib/toccatore/version.rb +3 -0
- data/spec/cli_spec.rb +29 -0
- data/spec/fixtures/orcid_update.json +1015 -0
- data/spec/fixtures/orcid_update_nil.json +12 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_fail.yml +149 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_succeed.yml +3453 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +149 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_no_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +3307 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +236 -0
- data/spec/orcid_update_spec.rb +120 -0
- data/spec/spec_helper.rb +91 -0
- data/toccatore.gemspec +37 -0
- metadata +314 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 13870f2c02d23ff37a9f4de7af84bfa353af388d
|
4
|
+
data.tar.gz: a356318891ee353f9dab772c9f74b1629a21b6dc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63cd648459e66ce0bda3434ec311e35b12fd66d617882cb59c78a1c9091a6448c9ff2f43c0bd187d1b27c8e0cfabfd861f11325d73de8245be8419f3f79a6086
|
7
|
+
data.tar.gz: c0a6a5c484535dace3a5a677e73c97858f3f27bcab2383917d9d7ea631242abd869ed85b6d5eca275a87ab55ac072938b2a6c96093e50cdc13d3b259fea2573c
|
data/.gitignore
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
.env
|
15
|
+
|
16
|
+
## Specific to RubyMotion:
|
17
|
+
.dat*
|
18
|
+
.repl_history
|
19
|
+
build/
|
20
|
+
*.bridgesupport
|
21
|
+
build-iPhoneOS/
|
22
|
+
build-iPhoneSimulator/
|
23
|
+
|
24
|
+
## Specific to RubyMotion (use of CocoaPods):
|
25
|
+
#
|
26
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
27
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
28
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
29
|
+
#
|
30
|
+
# vendor/Pods/
|
31
|
+
|
32
|
+
## Documentation cache and generated files:
|
33
|
+
/.yardoc/
|
34
|
+
/_yardoc/
|
35
|
+
/doc/
|
36
|
+
/rdoc/
|
37
|
+
|
38
|
+
## Environment normalization:
|
39
|
+
/.bundle/
|
40
|
+
/vendor/bundle
|
41
|
+
/lib/bundler/man/
|
42
|
+
|
43
|
+
# for a library or gem, you might want to ignore these files since the code is
|
44
|
+
# intended to run in multiple environments; otherwise, check them in:
|
45
|
+
# Gemfile.lock
|
46
|
+
# .ruby-version
|
47
|
+
# .ruby-gemset
|
48
|
+
|
49
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
50
|
+
.rvmrc
|
data/.travis.yml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
sudo: false
|
4
|
+
rvm:
|
5
|
+
- 2.3.1
|
6
|
+
script:
|
7
|
+
- bundle exec rspec
|
8
|
+
- bundle exec codeclimate-test-reporter
|
9
|
+
notifications:
|
10
|
+
slack: datacite:Wt8En0ALoTA6Kjc5EOKNDWxN
|
11
|
+
email: false
|
12
|
+
deploy:
|
13
|
+
provider: rubygems
|
14
|
+
api_key:
|
15
|
+
secure: eLWrKUf5as7RNOJpm4/viPR97yOpYjj7yiwblXf1axtBr/nJCi/ZZ3UyakHbsM88DI+sF52A56cum+0KkWTQuzBVR6TXB3u/UclC0z1pyjv6QClm3qLi5/lx6f//7K9FmdB130CuAUbJgUBiIyDElPJK4bE+teBUWft/Pb49Yy1/5M5F0VV/lZrOQ/O6js9cdmxxmp8DfC+UMmw1I982VGJ1xTW6vhWlZ3pA+PLi7KkdxzA5f3/SQLIC8ij6i9FLFXz37qs5ynumzDKiyshKoVZ7mVeR0SjmGAteAXDqkwmknJPMJTHxc2dvxDpZjB1KguBw6Ohs/Bv+R14bzyXepkBaZ8Mo++Ro0EqRdP9tdLbhhtJJ4+MrVPqYVL+JakAcJY3Y58e/j4ZOvbjrWFJ3oMljDpxzSUNvMvpWCQ8NlEDAhOEG3b4jbERl+vEhjYlcDVeSxBrxA02wXCoWTrZxpkRZY8qdgA3O21W+pcixEGIYT/Ox0jTfWdQUhqjJM16qcN13i6SMzeC1FaihXkA1AltUtEIgXA+uJA9aMrhHGYktMc2XkUqO4blna6ExzBwvafzZgor47oCOh1VLxpas+5Hui3YKEnmHn8sxKa26WuQJomnXgXfhB8n+eB/KWunWOPiBvJZDMU2C6AvP6N1MDTKvQWp2bwU5Jamt9vNjTtM=
|
16
|
+
gem: toccatore
|
17
|
+
on:
|
18
|
+
tags: true
|
19
|
+
repo: datacite/toccatore
|
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
toccatore (0.1)
|
5
|
+
activesupport (~> 4.2, >= 4.2.5)
|
6
|
+
dotenv (~> 2.1, >= 2.1.1)
|
7
|
+
gender_detector (~> 1.0)
|
8
|
+
maremma (~> 3.1)
|
9
|
+
namae (~> 0.11.0)
|
10
|
+
thor (~> 0.19)
|
11
|
+
|
12
|
+
GEM
|
13
|
+
remote: https://rubygems.org/
|
14
|
+
specs:
|
15
|
+
activesupport (4.2.7.1)
|
16
|
+
i18n (~> 0.7)
|
17
|
+
json (~> 1.7, >= 1.7.7)
|
18
|
+
minitest (~> 5.1)
|
19
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
20
|
+
tzinfo (~> 1.1)
|
21
|
+
addressable (2.5.0)
|
22
|
+
public_suffix (~> 2.0, >= 2.0.2)
|
23
|
+
builder (3.2.3)
|
24
|
+
codeclimate-test-reporter (1.0.5)
|
25
|
+
simplecov
|
26
|
+
crack (0.4.3)
|
27
|
+
safe_yaml (~> 1.0.0)
|
28
|
+
diff-lcs (1.3)
|
29
|
+
docile (1.1.5)
|
30
|
+
dotenv (2.2.0)
|
31
|
+
excon (0.45.4)
|
32
|
+
faraday (0.9.2)
|
33
|
+
multipart-post (>= 1.2, < 3)
|
34
|
+
faraday-encoding (0.0.4)
|
35
|
+
faraday
|
36
|
+
faraday_middleware (0.10.1)
|
37
|
+
faraday (>= 0.7.4, < 1.0)
|
38
|
+
gender_detector (1.0.0)
|
39
|
+
hashdiff (0.3.2)
|
40
|
+
i18n (0.7.0)
|
41
|
+
json (1.8.6)
|
42
|
+
maremma (3.1.2)
|
43
|
+
activesupport (~> 4.2, >= 4.2.5)
|
44
|
+
addressable (~> 2.5)
|
45
|
+
builder (~> 3.2, >= 3.2.2)
|
46
|
+
excon (~> 0.45.0)
|
47
|
+
faraday (~> 0.9.2)
|
48
|
+
faraday-encoding (~> 0.0.1)
|
49
|
+
faraday_middleware (~> 0.10.0)
|
50
|
+
multi_json (~> 1.11.2)
|
51
|
+
nokogiri (~> 1.6.7)
|
52
|
+
oj (~> 2.13.1)
|
53
|
+
mini_portile2 (2.1.0)
|
54
|
+
minitest (5.10.1)
|
55
|
+
multi_json (1.11.3)
|
56
|
+
multipart-post (2.0.0)
|
57
|
+
namae (0.11.3)
|
58
|
+
nokogiri (1.6.8.1)
|
59
|
+
mini_portile2 (~> 2.1.0)
|
60
|
+
oj (2.13.1)
|
61
|
+
public_suffix (2.0.5)
|
62
|
+
rack (2.0.1)
|
63
|
+
rack-test (0.6.3)
|
64
|
+
rack (>= 1.0)
|
65
|
+
rake (12.0.0)
|
66
|
+
rspec (3.5.0)
|
67
|
+
rspec-core (~> 3.5.0)
|
68
|
+
rspec-expectations (~> 3.5.0)
|
69
|
+
rspec-mocks (~> 3.5.0)
|
70
|
+
rspec-core (3.5.4)
|
71
|
+
rspec-support (~> 3.5.0)
|
72
|
+
rspec-expectations (3.5.0)
|
73
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
74
|
+
rspec-support (~> 3.5.0)
|
75
|
+
rspec-mocks (3.5.0)
|
76
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
77
|
+
rspec-support (~> 3.5.0)
|
78
|
+
rspec-support (3.5.0)
|
79
|
+
safe_yaml (1.0.4)
|
80
|
+
simplecov (0.12.0)
|
81
|
+
docile (~> 1.1.0)
|
82
|
+
json (>= 1.8, < 3)
|
83
|
+
simplecov-html (~> 0.10.0)
|
84
|
+
simplecov-html (0.10.0)
|
85
|
+
thor (0.19.4)
|
86
|
+
thread_safe (0.3.5)
|
87
|
+
tzinfo (1.2.2)
|
88
|
+
thread_safe (~> 0.1)
|
89
|
+
vcr (3.0.3)
|
90
|
+
webmock (1.24.6)
|
91
|
+
addressable (>= 2.3.6)
|
92
|
+
crack (>= 0.3.2)
|
93
|
+
hashdiff
|
94
|
+
|
95
|
+
PLATFORMS
|
96
|
+
ruby
|
97
|
+
|
98
|
+
DEPENDENCIES
|
99
|
+
bundler (~> 1.0)
|
100
|
+
codeclimate-test-reporter (~> 1.0, >= 1.0.0)
|
101
|
+
rack-test (~> 0)
|
102
|
+
rake (~> 12.0)
|
103
|
+
rspec (~> 3.4)
|
104
|
+
simplecov (~> 0.12.0)
|
105
|
+
toccatore!
|
106
|
+
vcr (~> 3.0, >= 3.0.3)
|
107
|
+
webmock (~> 1.22, >= 1.22.3)
|
108
|
+
|
109
|
+
BUNDLED WITH
|
110
|
+
1.12.5
|
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 DataCite
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Toccatore
|
2
|
+
|
3
|
+
[Build Status](https://travis-ci.org/datacite/toccatore)
|
4
|
+
[Code Climate](https://codeclimate.com/github/datacite/toccatore)
|
5
|
+
[Test Coverage](https://codeclimate.com/github/datacite/toccatore/coverage)
|
6
|
+
|
7
|
+
Command-line client for finding ORCID IDs in DataCite metadata.
|
8
|
+
|
9
|
+
## Development
|
10
|
+
|
11
|
+
We use rspec for unit testing:
|
12
|
+
|
13
|
+
```
|
14
|
+
bundle exec rspec
|
15
|
+
```
|
16
|
+
|
17
|
+
Follow along via [Github Issues](https://github.com/datacite/toccatore/issues).
|
18
|
+
|
19
|
+
### Note on Patches/Pull Requests
|
20
|
+
|
21
|
+
* Fork the project
|
22
|
+
* Write tests for your new feature or a test that reproduces a bug
|
23
|
+
* Implement your feature or make a bug fix
|
24
|
+
* Do not mess with Rakefile, version or history
|
25
|
+
* Commit, push and make a pull request. Bonus points for topical branches.
|
26
|
+
|
27
|
+
## License
|
28
|
+
**toccatore** is released under the [MIT License](https://github.com/datacite/toccatore/blob/master/LICENSE.md).
|
data/bin/toccatore
ADDED
data/lib/toccatore.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "toccatore/orcid_update"
|
@@ -0,0 +1,361 @@
|
|
1
|
+
require 'namae'
|
2
|
+
require 'gender_detector'
|
3
|
+
|
4
|
+
module Toccatore
|
5
|
+
class Base
|
6
|
+
# Load ENV variables from a .env file, if it exists. The path is resolved
# relative to this file ("../../../.env" from __FILE__).
env_file = File.expand_path("../../../.env", __FILE__)
if File.exist?(env_file)
  require 'dotenv'
  Dotenv.load! env_file
end

# Load ENV variables from the container environment if the JSON dump exists,
# see https://github.com/phusion/baseimage-docker#envvar_dumps
env_json_file = "/etc/container_environment.json"
if File.exist?(env_json_file)
  # json is not required at the top of this file; load it lazily here (the
  # same way dotenv is loaded above) so JSON.parse cannot hit a NameError.
  require 'json'
  env_vars = JSON.parse(File.read(env_json_file))
  env_vars.each { |k, v| ENV[k] = v }
end
|
20
|
+
|
21
|
+
# Build the search API URL for works updated within a date window.
#
# options - Hash; the keys read here are:
#           :offset     - index of the first row (nil.to_i gives 0)
#           :rows       - page size, defaults to job_batch_size
#           :from_date  - start of the window, defaults to yesterday
#           :until_date - end of the window, defaults to today
#
# Relies on `q`, `url` and `job_batch_size` being defined on the receiver.
# Returns the URL as a String.
def get_query_url(options={})
  start = options[:offset].to_i
  page_size = options[:rows].presence || job_batch_size
  window_start = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  window_end = options[:until_date].presence || Time.now.to_date.iso8601

  updated = "updated:[#{window_start}T00:00:00Z TO #{window_end}T23:59:59Z]"
  filter = "#{updated} AND has_metadata:true AND is_active:true"
  fields = "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated"

  query = { q: q,
            start: start,
            rows: page_size,
            fl: fields,
            fq: filter,
            wt: "json" }

  url + URI.encode_www_form(query)
end
|
38
|
+
|
39
|
+
# Ask the search API how many works match the query, without fetching any
# rows (rows is forced to 0).
#
# Returns the "numFound" value of the response, or 0 when it is missing.
def get_total(options={})
  count_url = get_query_url(options.merge(rows: 0))
  response = Maremma.get(count_url, options)

  data = response.body.fetch("data", {})
  data.fetch("response", {}).fetch("numFound", 0)
end
|
44
|
+
|
45
|
+
# Process all works matching the query, one page of job_batch_size at a time.
#
# options - Hash passed on to get_total and process_data. It is not modified:
#           the page offset is added to a copy, so no :offset key is left
#           behind in the caller's hash.
#
# Returns the total number of works found.
def queue_jobs(options={})
  total = get_total(options)

  if total > 0
    # walk through paginated results
    total_pages = (total.to_f / job_batch_size).ceil

    (0...total_pages).each do |page|
      process_data(options.merge(offset: page * job_batch_size))
    end
  end

  # return number of works queued
  total
end
|
61
|
+
|
62
|
+
# Fetch one page of results, turn it into relations and push them.
#
# NOTE(review): parse_data returns the "errors" value when the response has
# errors, and that value is passed to push_data unchanged - confirm this is
# intended.
#
# Returns whatever push_data returns.
def process_data(options = {})
  fetch_options = options.merge(timeout: timeout, source_id: source_id)
  response = get_data(fetch_options)

  parse_options = options.merge(source_id: source_id)
  relations = parse_data(response, parse_options)

  # push to deposit API if no error and we have collected works and/or events
  push_data(relations, options)
end
|
70
|
+
|
71
|
+
# Query the search API with the URL built from options and return the
# Maremma response.
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end
|
75
|
+
|
76
|
+
# Turn a search API response into relations.
#
# result  - response object; its parsed payload is read from result.body,
#           the same way get_total reads it.
# options - unused here.
#
# Returns the "errors" value when the body carries errors, otherwise the
# relations built from the documents under data -> response -> docs.
def parse_data(result, options={})
  body = result.body

  errors = body.fetch("errors", nil)
  return errors if errors.present?

  # read from the body, not from the response object itself
  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end
|
82
|
+
|
83
|
+
# Push to Lagotto deposit API if no error and we have collected works.
#
# items   - relation hashes with the symbol keys :relation, :subj, :obj,
#           :message_type and :prefix. nil is treated as "nothing to push"
#           instead of raising.
# options - unused here.
#
# Returns an Array with one Maremma.post result per item, or [] when there
# is nothing to push.
def push_data(items, options={})
  items = Array(items)
  return [] if items.empty?

  items.map do |item|
    relation = item.fetch(:relation, {})
    deposit = { "deposit" => { "subj_id" => relation.fetch("subj_id", nil),
                               "obj_id" => relation.fetch("obj_id", nil),
                               "relation_type_id" => relation.fetch("relation_type_id", nil),
                               "source_id" => relation.fetch("source_id", nil),
                               "publisher_id" => relation.fetch("publisher_id", nil),
                               "subj" => item.fetch(:subj, {}),
                               "obj" => item.fetch(:obj, {}),
                               "message_type" => item.fetch(:message_type, "relation"),
                               "prefix" => item.fetch(:prefix, nil),
                               "source_token" => uuid } }

    Maremma.post push_url, data: deposit.to_json, content_type: 'json', token: access_token
  end
end
|
103
|
+
|
104
|
+
# Build all relations for a list of search API documents.
#
# For each document a subject hash is assembled from the document fields and
# the creators in the Base64-encoded XML, then relations are collected in
# this order: related DOIs, related GitHub URLs, ORCID contributions and,
# when source_id is "datacite_import", one relation for the work itself.
#
# NOTE(review): a document without a "doi" key raises on the prefix lookup.
#
# Returns a flat Array of relation hashes.
def get_relations_with_related_works(items)
  Array(items).flat_map do |item|
    doi = item.fetch("doi", nil)
    prefix = doi[/^10\.\d{4,5}/]
    pid = doi_as_url(doi)
    type = item.fetch("resourceTypeGeneral", nil)
    publisher_id = item.fetch("datacentre_symbol", nil)

    # the default is the Base64 encoding of an empty <hsh> element
    decoded_xml = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    resource = Hash.from_xml(decoded_xml).fetch("resource", {})
    authors = resource.fetch("creators", {}).fetch("creator", [])
    # a single creator comes back as a Hash rather than an Array
    authors = [authors] if authors.is_a?(Hash)

    subj = { "pid" => pid,
             "DOI" => doi,
             "author" => get_hashed_authors(authors),
             "title" => item.fetch("title", []).first,
             "container-title" => item.fetch("publisher", nil),
             "published" => item.fetch("publicationYear", nil),
             "issued" => item.fetch("minted", nil),
             "publisher_id" => publisher_id,
             "registration_agency_id" => "datacite",
             "tracked" => true,
             "type" => type }

    relations = []

    related_doi_identifiers = item.fetch('relatedIdentifier', []).select { |id| id =~ /:DOI:.+/ }
    relations += get_doi_relations(subj, related_doi_identifiers)

    related_github_identifiers = item.fetch('relatedIdentifier', []).select { |id| id =~ /:URL:https:\/\/github.com.+/ }
    relations += get_github_relations(subj, related_github_identifiers)

    name_identifiers = item.fetch('nameIdentifier', []).select { |id| id =~ /^ORCID:.+/ }
    relations += get_contributions(subj, name_identifiers)

    if source_id == "datacite_import"
      relations += [{ prefix: prefix,
                      relation: { "subj_id" => subj["pid"],
                                  "source_id" => source_id,
                                  "publisher_id" => subj["publisher_id"],
                                  "occurred_at" => subj["issued"] },
                      subj: subj }]
    end

    relations
  end
end
|
150
|
+
|
151
|
+
# Build relations between a work and the GitHub URLs it references.
#
# subj  - Hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read.
# items - strings of the form "<relationType>:<identifierType>:<url>".
#
# For every item this adds: the work -> URL relation, a URL -> repository
# "is_part_of" relation when the URL is not the repository URL itself, and
# an owner -> repository contribution.
#
# Returns an Array of relation hashes.
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, target = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    target = PostRank::URI.clean(target)
    github_hash = github_from_url(target)
    owner_url = github_as_owner_url(github_hash)
    repo_url = github_as_repo_url(github_hash)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => target,
                               "relation_type_id" => raw_relation_type.underscore,
                               "source_id" => source_id,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => "github",
                               "occurred_at" => subj["issued"] },
                   subj: subj }

    # if relatedIdentifier is release URL rather than repo URL
    unless target == repo_url
      relations << { relation: { "subj_id" => target,
                                 "obj_id" => repo_url,
                                 "relation_type_id" => "is_part_of",
                                 "source_id" => source_id,
                                 "publisher_id" => "github",
                                 "registration_agency_id" => "github" } }
    end

    relations << { message_type: "contribution",
                   relation: { "subj_id" => owner_url,
                               "obj_id" => repo_url,
                               "source_id" => "github_contributor",
                               "registration_agency_id" => "github" }}
  end
end
|
191
|
+
|
192
|
+
# Build relations between a work and the DOIs it references.
#
# subj  - Hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read.
# items - strings of the form "<relationType>:<identifierType>:<doi>".
#
# Items whose DOI is registered with DataCite are skipped when source_id is
# "datacite_crossref". The relation's source is "datacite_crossref" for
# Crossref DOIs and "datacite_related" for everything else.
#
# Returns an Array of relation hashes.
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    raw_relation_type, _related_identifier_type, related_identifier = item.split(':', 3)
    doi = related_identifier.strip.upcase
    registration_agency = get_doi_ra(doi)

    next if source_id == "datacite_crossref" && registration_agency == "datacite"

    relation_source = if registration_agency == "crossref"
                        "datacite_crossref"
                      else
                        "datacite_related"
                      end
    pid = doi_as_url(doi)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => pid,
                               "relation_type_id" => raw_relation_type.underscore,
                               "source_id" => relation_source,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => registration_agency,
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end
|
218
|
+
|
219
|
+
# Build contribution relations between ORCID IDs and a work.
# We are flipping subj and obj for contributions: the ORCID ID is the
# subject and the work is the object.
#
# obj   - Hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read.
# items - name identifier strings of the form "ORCID:<id>".
#
# Returns an Array with one relation hash per item that has a valid ORCID ID.
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).reduce([]) do |sum, item|
    orcid = item.split(':', 2).last
    orcid = validate_orcid(orcid)

    # Skip an invalid ORCID ID but keep going. A `return` here would leave
    # the whole method and silently drop every remaining item.
    next sum if orcid.nil?

    sum << { prefix: prefix,
             message_type: "contribution",
             relation: { "subj_id" => orcid_as_url(orcid),
                         "obj_id" => obj["pid"],
                         "relation_type_id" => nil,
                         "source_id" => source_id,
                         "publisher_id" => obj["publisher_id"],
                         "registration_agency_id" => "datacite",
                         "occurred_at" => obj["issued"] },
             obj: obj }
  end
end
|
241
|
+
|
242
|
+
# Names of the configuration fields.
def config_fields
  %i[url push_url access_token]
end

# Base URL of the search API, ending in "?" so that an encoded query string
# can be appended directly (see get_query_url).
def url
  "https://search.datacite.org/api?"
end

# Timeout value passed along with requests (see process_data).
def timeout
  120
end

# Number of rows requested per page (see queue_jobs).
def job_batch_size
  1_000
end
|
257
|
+
|
258
|
+
# Remove non-printing whitespace: strips every zero-width space (U+200B)
# from the DOI and returns the result as a new String.
def clean_doi(doi)
  doi.delete("\u200B")
end
|
262
|
+
|
263
|
+
# Extract an upper-cased DOI from a doi.org / dx.doi.org URL or from a
# "doi:" prefixed string.
#
# Returns the DOI as a String, or nil when the input has neither form.
def doi_from_url(url)
  resolver_url = /(http|https):\/\/(dx\.)?doi\.org\/(\w+)/.match(url)

  if resolver_url
    # the DOI is the URL path without its leading slash
    path = Addressable::URI.parse(url).path
    path[1..-1].upcase
  elsif url.start_with?("doi:")
    url[4..-1].upcase
  end
end
|
271
|
+
|
272
|
+
# Express a DOI as an encoded https://doi.org/ URL, after removing
# zero-width spaces. Returns nil when the DOI is not present.
def doi_as_url(doi)
  return nil unless doi.present?

  Addressable::URI.encode("https://doi.org/#{clean_doi(doi)}")
end
|
275
|
+
|
276
|
+
# Return everything after "http://orcid.org/" in the given URL, or nil when
# the URL does not start with that prefix.
def orcid_from_url(url)
  found = /\Ahttp:\/\/orcid\.org\/(.+)/.match(url)
  found && found[1]
end
|
279
|
+
|
280
|
+
# Express an ORCID ID as an http://orcid.org/ URL.
# Returns nil when the ID is not present.
def orcid_as_url(orcid)
  return nil unless orcid.present?

  "http://orcid.org/#{orcid}"
end
|
283
|
+
|
284
|
+
# Validate the format of an ORCID ID and return it in its bare form.
#
# orcid - a bare ID ("0000-0002-1825-0097") or the ID prefixed with
#         http://orcid.org/ or https://orcid.org/.
#
# The ID must be four dash-separated groups of four characters: digits,
# with "X" also allowed in the last position. Exactly one final character
# is accepted, so over-long strings are rejected. The check digit itself is
# not verified.
#
# Returns the bare ID as a String, or nil when the format does not match.
def validate_orcid(orcid)
  found = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  found && found[1]
end
|
287
|
+
|
288
|
+
# Parse author string into CSL format.
# Only assume personal name when using sort-order: "Turing, Alan"
#
# author  - the raw name; nil or a blank string yields an empty literal
#           instead of raising.
# options - unused here.
#
# Returns { "literal" => ... } when the name is blank, cannot be parsed or
# is not recognised as a personal name, otherwise a Hash with the "family"
# and "given" parts that are present.
def get_one_author(author, options = {})
  # creatorName can be missing, in which case callers pass nil
  return { "literal" => "" } if author.to_s.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  if names.blank? || is_personal_name?(author).blank?
    { "literal" => author }
  else
    name = names.first

    { "family" => name.family,
      "given" => name.given }.compact
  end
end
|
305
|
+
|
306
|
+
# Normalise an author string before it is parsed.
#
# Names without a comma that end in one or two initials ("Smith J.") get a
# comma inserted before the initials. The result is then titleized and
# every whitespace character is replaced with a plain space.
def cleanup_author(author)
  # detect pattern "Smith J.", but not "Smith, John K."
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  # titleize strings
  titleized = author.my_titleize

  # remove non-standard space characters
  titleized.gsub(/[[:space:]]/, ' ')
end
|
315
|
+
|
316
|
+
# Decide whether an author string is a personal name: true when it contains
# a comma, otherwise the result of looking up its first word as a given
# name in the name detector.
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end
|
322
|
+
|
323
|
+
# Parse array of author strings into CSL format.
# A nil or single value is wrapped with Array() first; each entry is handed
# to get_one_author together with options.
def get_authors(authors, options={})
  Array(authors).map do |author|
    get_one_author(author, options)
  end
end
|
327
|
+
|
328
|
+
# Parse array of author hashes into CSL format.
# A nil value yields an empty Array; each entry is handed to
# get_one_hashed_author.
def get_hashed_authors(authors)
  Array(authors).map do |author|
    get_one_hashed_author(author)
  end
end
|
332
|
+
|
333
|
+
# Convert one creator hash into CSL format.
#
# The "creatorName" value is parsed with get_one_author and the ORCID URL
# from get_name_identifier is added under "ORCID"; entries with a nil value
# (such as a missing ORCID) are dropped from the result.
def get_one_hashed_author(author)
  parsed_name = get_one_author(author.fetch("creatorName", nil))

  parsed_name.merge("ORCID" => get_name_identifier(author)).compact
end
|
340
|
+
|
341
|
+
# Return the ORCID URL for a creator hash, or nil.
#
# The scheme is read from "nameIdentifierScheme" (treated as "orcid" when
# the key is absent) and compared case-insensitively. A URL is returned
# only when the scheme is "orcid" and "nameIdentifier" passes
# validate_orcid.
def get_name_identifier(author)
  raw_identifier = author.fetch("nameIdentifier", nil)
  scheme = author.fetch("nameIdentifierScheme", "orcid").downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(raw_identifier)
  orcid ? "http://orcid.org/#{orcid}" : nil
end
|
350
|
+
|
351
|
+
# Detector used by is_personal_name? to look up given names.
#
# The instance is built once and then reused, so looking up many authors
# does not construct a new GenderDetector for every name.
def name_detector
  @name_detector ||= GenderDetector.new
end
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
class String
  # Upcase the character that follows every word boundary or underscore and
  # leave all other characters untouched (unlike a full titleize, the rest
  # of each word is not downcased).
  def my_titleize
    gsub(/(\b|_)(.)/) do
      Regexp.last_match(1) + Regexp.last_match(2).upcase
    end
  end
end
|