toccatore 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.travis.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +110 -0
- data/LICENSE.md +21 -0
- data/README.md +28 -0
- data/bin/toccatore +5 -0
- data/lib/toccatore.rb +1 -0
- data/lib/toccatore/base.rb +361 -0
- data/lib/toccatore/cli.rb +38 -0
- data/lib/toccatore/orcid_update.rb +56 -0
- data/lib/toccatore/version.rb +3 -0
- data/spec/cli_spec.rb +29 -0
- data/spec/fixtures/orcid_update.json +1015 -0
- data/spec/fixtures/orcid_update_nil.json +12 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_fail.yml +149 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_CLI/orcid_update/should_succeed.yml +3453 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +149 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_no_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/get_total/with_works.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/push_data/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +3307 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_no_works_returned_by_the_Datacite_Metadata_Search_API.yml +38 -0
- data/spec/fixtures/vcr_cassettes/Toccatore_OrcidUpdate/queue_jobs/should_report_if_there_are_works_returned_by_the_Datacite_Metadata_Search_API.yml +236 -0
- data/spec/orcid_update_spec.rb +120 -0
- data/spec/spec_helper.rb +91 -0
- data/toccatore.gemspec +37 -0
- metadata +314 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 13870f2c02d23ff37a9f4de7af84bfa353af388d
|
4
|
+
data.tar.gz: a356318891ee353f9dab772c9f74b1629a21b6dc
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63cd648459e66ce0bda3434ec311e35b12fd66d617882cb59c78a1c9091a6448c9ff2f43c0bd187d1b27c8e0cfabfd861f11325d73de8245be8419f3f79a6086
|
7
|
+
data.tar.gz: c0a6a5c484535dace3a5a677e73c97858f3f27bcab2383917d9d7ea631242abd869ed85b6d5eca275a87ab55ac072938b2a6c96093e50cdc13d3b259fea2573c
|
data/.gitignore
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
.env
|
15
|
+
|
16
|
+
## Specific to RubyMotion:
|
17
|
+
.dat*
|
18
|
+
.repl_history
|
19
|
+
build/
|
20
|
+
*.bridgesupport
|
21
|
+
build-iPhoneOS/
|
22
|
+
build-iPhoneSimulator/
|
23
|
+
|
24
|
+
## Specific to RubyMotion (use of CocoaPods):
|
25
|
+
#
|
26
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
27
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
28
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
29
|
+
#
|
30
|
+
# vendor/Pods/
|
31
|
+
|
32
|
+
## Documentation cache and generated files:
|
33
|
+
/.yardoc/
|
34
|
+
/_yardoc/
|
35
|
+
/doc/
|
36
|
+
/rdoc/
|
37
|
+
|
38
|
+
## Environment normalization:
|
39
|
+
/.bundle/
|
40
|
+
/vendor/bundle
|
41
|
+
/lib/bundler/man/
|
42
|
+
|
43
|
+
# for a library or gem, you might want to ignore these files since the code is
|
44
|
+
# intended to run in multiple environments; otherwise, check them in:
|
45
|
+
# Gemfile.lock
|
46
|
+
# .ruby-version
|
47
|
+
# .ruby-gemset
|
48
|
+
|
49
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
50
|
+
.rvmrc
|
data/.travis.yml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
sudo: false
|
4
|
+
rvm:
|
5
|
+
- 2.3.1
|
6
|
+
script:
|
7
|
+
- bundle exec rspec
|
8
|
+
- bundle exec codeclimate-test-reporter
|
9
|
+
notifications:
|
10
|
+
slack: datacite:Wt8En0ALoTA6Kjc5EOKNDWxN
|
11
|
+
email: false
|
12
|
+
deploy:
|
13
|
+
provider: rubygems
|
14
|
+
api_key:
|
15
|
+
secure: eLWrKUf5as7RNOJpm4/viPR97yOpYjj7yiwblXf1axtBr/nJCi/ZZ3UyakHbsM88DI+sF52A56cum+0KkWTQuzBVR6TXB3u/UclC0z1pyjv6QClm3qLi5/lx6f//7K9FmdB130CuAUbJgUBiIyDElPJK4bE+teBUWft/Pb49Yy1/5M5F0VV/lZrOQ/O6js9cdmxxmp8DfC+UMmw1I982VGJ1xTW6vhWlZ3pA+PLi7KkdxzA5f3/SQLIC8ij6i9FLFXz37qs5ynumzDKiyshKoVZ7mVeR0SjmGAteAXDqkwmknJPMJTHxc2dvxDpZjB1KguBw6Ohs/Bv+R14bzyXepkBaZ8Mo++Ro0EqRdP9tdLbhhtJJ4+MrVPqYVL+JakAcJY3Y58e/j4ZOvbjrWFJ3oMljDpxzSUNvMvpWCQ8NlEDAhOEG3b4jbERl+vEhjYlcDVeSxBrxA02wXCoWTrZxpkRZY8qdgA3O21W+pcixEGIYT/Ox0jTfWdQUhqjJM16qcN13i6SMzeC1FaihXkA1AltUtEIgXA+uJA9aMrhHGYktMc2XkUqO4blna6ExzBwvafzZgor47oCOh1VLxpas+5Hui3YKEnmHn8sxKa26WuQJomnXgXfhB8n+eB/KWunWOPiBvJZDMU2C6AvP6N1MDTKvQWp2bwU5Jamt9vNjTtM=
|
16
|
+
gem: toccatore
|
17
|
+
on:
|
18
|
+
tags: true
|
19
|
+
repo: datacite/toccatore
|
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
toccatore (0.1)
|
5
|
+
activesupport (~> 4.2, >= 4.2.5)
|
6
|
+
dotenv (~> 2.1, >= 2.1.1)
|
7
|
+
gender_detector (~> 1.0)
|
8
|
+
maremma (~> 3.1)
|
9
|
+
namae (~> 0.11.0)
|
10
|
+
thor (~> 0.19)
|
11
|
+
|
12
|
+
GEM
|
13
|
+
remote: https://rubygems.org/
|
14
|
+
specs:
|
15
|
+
activesupport (4.2.7.1)
|
16
|
+
i18n (~> 0.7)
|
17
|
+
json (~> 1.7, >= 1.7.7)
|
18
|
+
minitest (~> 5.1)
|
19
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
20
|
+
tzinfo (~> 1.1)
|
21
|
+
addressable (2.5.0)
|
22
|
+
public_suffix (~> 2.0, >= 2.0.2)
|
23
|
+
builder (3.2.3)
|
24
|
+
codeclimate-test-reporter (1.0.5)
|
25
|
+
simplecov
|
26
|
+
crack (0.4.3)
|
27
|
+
safe_yaml (~> 1.0.0)
|
28
|
+
diff-lcs (1.3)
|
29
|
+
docile (1.1.5)
|
30
|
+
dotenv (2.2.0)
|
31
|
+
excon (0.45.4)
|
32
|
+
faraday (0.9.2)
|
33
|
+
multipart-post (>= 1.2, < 3)
|
34
|
+
faraday-encoding (0.0.4)
|
35
|
+
faraday
|
36
|
+
faraday_middleware (0.10.1)
|
37
|
+
faraday (>= 0.7.4, < 1.0)
|
38
|
+
gender_detector (1.0.0)
|
39
|
+
hashdiff (0.3.2)
|
40
|
+
i18n (0.7.0)
|
41
|
+
json (1.8.6)
|
42
|
+
maremma (3.1.2)
|
43
|
+
activesupport (~> 4.2, >= 4.2.5)
|
44
|
+
addressable (~> 2.5)
|
45
|
+
builder (~> 3.2, >= 3.2.2)
|
46
|
+
excon (~> 0.45.0)
|
47
|
+
faraday (~> 0.9.2)
|
48
|
+
faraday-encoding (~> 0.0.1)
|
49
|
+
faraday_middleware (~> 0.10.0)
|
50
|
+
multi_json (~> 1.11.2)
|
51
|
+
nokogiri (~> 1.6.7)
|
52
|
+
oj (~> 2.13.1)
|
53
|
+
mini_portile2 (2.1.0)
|
54
|
+
minitest (5.10.1)
|
55
|
+
multi_json (1.11.3)
|
56
|
+
multipart-post (2.0.0)
|
57
|
+
namae (0.11.3)
|
58
|
+
nokogiri (1.6.8.1)
|
59
|
+
mini_portile2 (~> 2.1.0)
|
60
|
+
oj (2.13.1)
|
61
|
+
public_suffix (2.0.5)
|
62
|
+
rack (2.0.1)
|
63
|
+
rack-test (0.6.3)
|
64
|
+
rack (>= 1.0)
|
65
|
+
rake (12.0.0)
|
66
|
+
rspec (3.5.0)
|
67
|
+
rspec-core (~> 3.5.0)
|
68
|
+
rspec-expectations (~> 3.5.0)
|
69
|
+
rspec-mocks (~> 3.5.0)
|
70
|
+
rspec-core (3.5.4)
|
71
|
+
rspec-support (~> 3.5.0)
|
72
|
+
rspec-expectations (3.5.0)
|
73
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
74
|
+
rspec-support (~> 3.5.0)
|
75
|
+
rspec-mocks (3.5.0)
|
76
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
77
|
+
rspec-support (~> 3.5.0)
|
78
|
+
rspec-support (3.5.0)
|
79
|
+
safe_yaml (1.0.4)
|
80
|
+
simplecov (0.12.0)
|
81
|
+
docile (~> 1.1.0)
|
82
|
+
json (>= 1.8, < 3)
|
83
|
+
simplecov-html (~> 0.10.0)
|
84
|
+
simplecov-html (0.10.0)
|
85
|
+
thor (0.19.4)
|
86
|
+
thread_safe (0.3.5)
|
87
|
+
tzinfo (1.2.2)
|
88
|
+
thread_safe (~> 0.1)
|
89
|
+
vcr (3.0.3)
|
90
|
+
webmock (1.24.6)
|
91
|
+
addressable (>= 2.3.6)
|
92
|
+
crack (>= 0.3.2)
|
93
|
+
hashdiff
|
94
|
+
|
95
|
+
PLATFORMS
|
96
|
+
ruby
|
97
|
+
|
98
|
+
DEPENDENCIES
|
99
|
+
bundler (~> 1.0)
|
100
|
+
codeclimate-test-reporter (~> 1.0, >= 1.0.0)
|
101
|
+
rack-test (~> 0)
|
102
|
+
rake (~> 12.0)
|
103
|
+
rspec (~> 3.4)
|
104
|
+
simplecov (~> 0.12.0)
|
105
|
+
toccatore!
|
106
|
+
vcr (~> 3.0, >= 3.0.3)
|
107
|
+
webmock (~> 1.22, >= 1.22.3)
|
108
|
+
|
109
|
+
BUNDLED WITH
|
110
|
+
1.12.5
|
data/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2017 DataCite
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Toccatore
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/datacite/toccatore.svg?branch=master)](https://travis-ci.org/datacite/toccatore)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/datacite/toccatore/badges/gpa.svg)](https://codeclimate.com/github/datacite/toccatore)
|
5
|
+
[![Test Coverage](https://codeclimate.com/github/datacite/toccatore/badges/coverage.svg)](https://codeclimate.com/github/datacite/toccatore/coverage)
|
6
|
+
|
7
|
+
Command-line client for finding ORCID IDs in DataCite metadata.
|
8
|
+
|
9
|
+
## Development
|
10
|
+
|
11
|
+
We use RSpec for unit testing:
|
12
|
+
|
13
|
+
```
|
14
|
+
bundle exec rspec
|
15
|
+
```
|
16
|
+
|
17
|
+
Follow along via [GitHub Issues](https://github.com/datacite/toccatore/issues).
|
18
|
+
|
19
|
+
### Note on Patches/Pull Requests
|
20
|
+
|
21
|
+
* Fork the project
|
22
|
+
* Write tests for your new feature or a test that reproduces a bug
|
23
|
+
* Implement your feature or make a bug fix
|
24
|
+
* Do not mess with Rakefile, version or history
|
25
|
+
* Commit, push and make a pull request. Bonus points for topical branches.
|
26
|
+
|
27
|
+
## License
|
28
|
+
**toccatore** is released under the [MIT License](https://github.com/datacite/toccatore/blob/master/LICENSE.md).
|
data/bin/toccatore
ADDED
data/lib/toccatore.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "toccatore/orcid_update"
|
@@ -0,0 +1,361 @@
|
|
1
|
+
require 'base64'

require 'gender_detector'
require 'namae'
|
3
|
+
|
4
|
+
module Toccatore
|
5
|
+
class Base
|
6
|
+
# load ENV variables from .env file if it exists
env_file = File.expand_path("../../../.env", __FILE__)
if File.exist?(env_file)
  require 'dotenv'
  Dotenv.load! env_file
end

# load ENV variables from container environment if json file exists
# see https://github.com/phusion/baseimage-docker#envvar_dumps
env_json_file = "/etc/container_environment.json"
if File.exist?(env_json_file)
  # loaded lazily, like dotenv above, so JSON is guaranteed to be defined here
  require 'json'

  env_vars = JSON.parse(File.read(env_json_file))

  # ENV only accepts String values (assigning nil deletes the variable).
  # JSON numbers and booleans would raise TypeError, so stringify them.
  env_vars.each { |k, v| ENV[k] = v.nil? ? nil : v.to_s }
end
|
20
|
+
|
21
|
+
# Build the search API URL for one page of results.
#
# options - Hash with optional keys:
#           :offset     - index of the first row (defaults to 0)
#           :rows       - page size (defaults to job_batch_size)
#           :from_date  - first day of the "updated" window (defaults to yesterday)
#           :until_date - last day of the "updated" window (defaults to today)
#
# NOTE(review): `q` is not defined in this class as shown; presumably a
# subclass provides it - confirm.
#
# Returns the URL as a String.
def get_query_url(options={})
  start = options[:offset].to_i
  page_size = options[:rows].presence || job_batch_size
  first_day = options[:from_date].presence || (Time.now.to_date - 1.day).iso8601
  last_day = options[:until_date].presence || Time.now.to_date.iso8601

  date_range = "updated:[#{first_day}T00:00:00Z TO #{last_day}T23:59:59Z]"
  filter = "#{date_range} AND has_metadata:true AND is_active:true"
  fields = "doi,creator,title,publisher,publicationYear,resourceTypeGeneral,datacentre_symbol,relatedIdentifier,nameIdentifier,xml,minted,updated"

  query = { q: q,
            start: start,
            rows: page_size,
            fl: fields,
            fq: filter,
            wt: "json" }

  url + URI.encode_www_form(query)
end
|
38
|
+
|
39
|
+
# Ask the search API how many works match the query, without fetching rows.
#
# options - same options as get_query_url; :rows is overridden with 0.
#
# Returns the "numFound" value of the response, or 0 if it is missing.
def get_total(options={})
  count_url = get_query_url(options.merge(rows: 0))
  body = Maremma.get(count_url, options).body

  body.fetch("data", {})
      .fetch("response", {})
      .fetch("numFound", 0)
end
|
44
|
+
|
45
|
+
# Process all matching works, one page of job_batch_size rows at a time.
#
# options - passed on to get_total and process_data; each page receives its
#           own copy with :offset set, so the caller's hash is not modified.
#
# Returns the total number of works found.
def queue_jobs(options={})
  total = get_total(options)

  if total > 0
    # walk through paginated results
    total_pages = (total.to_f / job_batch_size).ceil

    (0...total_pages).each do |page|
      process_data(options.merge(offset: page * job_batch_size))
    end
  end

  # return number of works queued
  total
end
|
61
|
+
|
62
|
+
# Fetch one page of results, convert it and push it to the deposit API.
#
# options - passed to get_data (with :timeout and :source_id added),
#           parse_data (with :source_id added) and push_data.
#
# Returns whatever push_data returns.
def process_data(options = {})
  response = get_data(options.merge(timeout: timeout, source_id: source_id))
  works = parse_data(response, options.merge(source_id: source_id))

  push_data(works, options)
end
|
70
|
+
|
71
|
+
# Fetch one page of results from the search API.
#
# options - used both to build the query URL and as options for Maremma.get.
#
# Returns the Maremma response.
def get_data(options={})
  Maremma.get(get_query_url(options), options)
end
|
75
|
+
|
76
|
+
# Turn a search API response into an array of relation hashes.
#
# result  - response object whose #body is a Hash
# options - currently unused
#
# Returns the "errors" entry of the body if it is present, otherwise the
# result of get_relations_with_related_works for the returned documents.
def parse_data(result, options={})
  body = result.body

  errors = body.fetch("errors", nil)
  return errors if errors.present?

  # the documents live in the response body, as in get_total
  items = body.fetch("data", {}).fetch('response', {}).fetch('docs', nil)
  get_relations_with_related_works(items)
end
|
82
|
+
|
83
|
+
# push to Lagotto deposit API if no error and we have collected works
#
# items   - array of hashes with optional keys :relation, :subj, :obj,
#           :message_type and :prefix; nil is treated as an empty array
# options - currently unused
#
# Returns an array with one Maremma.post result per item ([] if no items).
def push_data(items, options={})
  # Array() makes nil safe; mapping an empty array already yields []
  Array(items).map do |item|
    relation = item.fetch(:relation, {})
    deposit = { "deposit" => { "subj_id" => relation.fetch("subj_id", nil),
                               "obj_id" => relation.fetch("obj_id", nil),
                               "relation_type_id" => relation.fetch("relation_type_id", nil),
                               "source_id" => relation.fetch("source_id", nil),
                               "publisher_id" => relation.fetch("publisher_id", nil),
                               "subj" => item.fetch(:subj, {}),
                               "obj" => item.fetch(:obj, {}),
                               "message_type" => item.fetch(:message_type, "relation"),
                               "prefix" => item.fetch(:prefix, nil),
                               "source_token" => uuid } }

    Maremma.post push_url, data: deposit.to_json, content_type: 'json', token: access_token
  end
end
|
103
|
+
|
104
|
+
# Convert search result documents into an array of relation hashes.
#
# For every document this collects the relations returned by
# get_doi_relations, get_github_relations and get_contributions, plus one
# import record per document when source_id is "datacite_import".
#
# items - array of document hashes with string keys; nil is treated as empty.
#         Documents without a "doi" are skipped.
#
# Returns an array of hashes.
def get_relations_with_related_works(items)
  Array(items).reduce([]) do |sum, item|
    doi = item.fetch("doi", nil)

    # without a DOI neither prefix nor pid can be derived; skip the document
    # instead of raising NoMethodError on nil
    next sum if doi.nil?

    prefix = doi[/^10\.\d{4,5}/]
    pid = doi_as_url(doi)
    type = item.fetch("resourceTypeGeneral", nil)
    publisher_id = item.fetch("datacentre_symbol", nil)

    # the default value is the Base64 encoding of "<hsh></hsh>"
    xml = Base64.decode64(item.fetch('xml', "PGhzaD48L2hzaD4=\n"))
    xml = Hash.from_xml(xml).fetch("resource", {})
    authors = xml.fetch("creators", {}).fetch("creator", [])
    # a single creator is parsed as a Hash rather than an Array of hashes
    authors = [authors] if authors.is_a?(Hash)

    subj = { "pid" => pid,
             "DOI" => doi,
             "author" => get_hashed_authors(authors),
             "title" => item.fetch("title", []).first,
             "container-title" => item.fetch("publisher", nil),
             "published" => item.fetch("publicationYear", nil),
             "issued" => item.fetch("minted", nil),
             "publisher_id" => publisher_id,
             "registration_agency_id" => "datacite",
             "tracked" => true,
             "type" => type }

    # Array() also covers a key that is present with a nil value
    related_identifiers = Array(item.fetch('relatedIdentifier', []))

    related_doi_identifiers = related_identifiers.select { |id| id =~ /:DOI:.+/ }
    sum.concat(get_doi_relations(subj, related_doi_identifiers))

    related_github_identifiers = related_identifiers.select { |id| id =~ /:URL:https:\/\/github.com.+/ }
    sum.concat(get_github_relations(subj, related_github_identifiers))

    name_identifiers = Array(item.fetch('nameIdentifier', [])).select { |id| id =~ /^ORCID:.+/ }
    sum.concat(get_contributions(subj, name_identifiers))

    if source_id == "datacite_import"
      sum << { prefix: prefix,
               relation: { "subj_id" => subj["pid"],
                           "source_id" => source_id,
                           "publisher_id" => subj["publisher_id"],
                           "occurred_at" => subj["issued"] },
               subj: subj }
    end

    sum
  end
end
|
150
|
+
|
151
|
+
# Build relation hashes for related identifiers that point to GitHub.
#
# subj  - hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read
# items - array of "relationType:identifierType:identifier" strings
#
# For each item this adds the relation between the work and the GitHub URL,
# an "is_part_of" relation when the cleaned URL differs from the repo URL,
# and a contribution linking the owner URL to the repo URL.
#
# Returns an array of hashes.
def get_github_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    relation_name, _scheme, target = item.split(':', 3)

    # get parent repo
    # code from https://github.com/octokit/octokit.rb/blob/master/lib/octokit/repository.rb
    target = PostRank::URI.clean(target)
    github_hash = github_from_url(target)
    owner_url = github_as_owner_url(github_hash)
    repo_url = github_as_repo_url(github_hash)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => target,
                               "relation_type_id" => relation_name.underscore,
                               "source_id" => source_id,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => "github",
                               "occurred_at" => subj["issued"] },
                   subj: subj }

    # if relatedIdentifier is release URL rather than repo URL
    unless target == repo_url
      relations << { relation: { "subj_id" => target,
                                 "obj_id" => repo_url,
                                 "relation_type_id" => "is_part_of",
                                 "source_id" => source_id,
                                 "publisher_id" => "github",
                                 "registration_agency_id" => "github" } }
    end

    relations << { message_type: "contribution",
                   relation: { "subj_id" => owner_url,
                               "obj_id" => repo_url,
                               "source_id" => "github_contributor",
                               "registration_agency_id" => "github" } }
  end
end
|
191
|
+
|
192
|
+
# Build relation hashes for related identifiers that are DOIs.
#
# subj  - hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read
# items - array of "relationType:identifierType:identifier" strings
#
# Items whose DOI is registered with "datacite" are skipped when source_id
# is "datacite_crossref".
#
# Returns an array of hashes.
def get_doi_relations(subj, items)
  prefix = subj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, relations|
    relation_name, _scheme, identifier = item.split(':', 3)
    related_doi = identifier.strip.upcase
    agency = get_doi_ra(related_doi)

    next if source_id == "datacite_crossref" && agency == "datacite"

    relation_source = agency == "crossref" ? "datacite_crossref" : "datacite_related"
    related_pid = doi_as_url(related_doi)

    relations << { prefix: prefix,
                   relation: { "subj_id" => subj["pid"],
                               "obj_id" => related_pid,
                               "relation_type_id" => relation_name.underscore,
                               "source_id" => relation_source,
                               "publisher_id" => subj["publisher_id"],
                               "registration_agency_id" => agency,
                               "occurred_at" => subj["issued"] },
                   subj: subj }
  end
end
|
218
|
+
|
219
|
+
# we are flipping subj and obj for contributions
#
# obj   - hash describing the work; "DOI", "pid", "publisher_id" and
#         "issued" are read
# items - array of "ORCID:<identifier>" strings
#
# Returns an array with one contribution hash per valid ORCID identifier.
# Invalid identifiers are skipped.
def get_contributions(obj, items)
  prefix = obj["DOI"][/^10\.\d{4,5}/]

  Array(items).each_with_object([]) do |item, sum|
    orcid = validate_orcid(item.split(':', 2).last)

    # skip only this entry: a `return` inside the block would leave the
    # whole method and drop every identifier that follows
    next if orcid.nil?

    sum << { prefix: prefix,
             message_type: "contribution",
             relation: { "subj_id" => orcid_as_url(orcid),
                         "obj_id" => obj["pid"],
                         "relation_type_id" => nil,
                         "source_id" => source_id,
                         "publisher_id" => obj["publisher_id"],
                         "registration_agency_id" => "datacite",
                         "occurred_at" => obj["issued"] },
             obj: obj }
  end
end
|
241
|
+
|
242
|
+
# Names of the configuration fields, as an array of symbols.
def config_fields
  %i[url push_url access_token]
end
|
245
|
+
|
246
|
+
# Base URL of the search API, ending in "?" so that the encoded query
# parameters can be appended directly (see get_query_url).
def url
  'https://search.datacite.org/api?'
end
|
249
|
+
|
250
|
+
# Timeout value passed as :timeout to get_data by process_data.
# NOTE(review): presumably seconds - confirm against Maremma.
def timeout
  120
end
|
253
|
+
|
254
|
+
# Number of rows per page: the default for :rows in get_query_url and the
# page size used by queue_jobs.
def job_batch_size
  1_000
end
|
257
|
+
|
258
|
+
# remove non-printing whitespace
#
# Strips every zero-width space (U+200B) from the DOI and returns the result
# as a new string.
def clean_doi(doi)
  doi.delete("\u200B")
end
|
262
|
+
|
263
|
+
# Extract the upper-cased DOI from a doi.org URL or a "doi:" identifier.
#
# url - a String such as "https://doi.org/10.5061/dryad.8515" or
#       "doi:10.5061/dryad.8515"; nil is accepted
#
# Returns the DOI as an upper-case String, or nil if url is neither form.
def doi_from_url(url)
  # nil becomes "", which matches neither branch
  url = url.to_s

  if %r{(http|https)://(dx\.)?doi\.org/(\w+)}.match(url)
    uri = Addressable::URI.parse(url)
    # drop the leading "/" of the path
    uri.path[1..-1].upcase
  elsif url.start_with?("doi:")
    url[4..-1].upcase
  end
end
|
271
|
+
|
272
|
+
# Turn a DOI into an encoded https://doi.org/ URL.
#
# Returns nil when doi is not present.
def doi_as_url(doi)
  return unless doi.present?

  Addressable::URI.encode("https://doi.org/#{clean_doi(doi)}")
end
|
275
|
+
|
276
|
+
# Extract the part after the host from an orcid.org URL.
#
# url - a String such as "http://orcid.org/0000-0002-1825-0097"; both the
#       http and the https form are accepted
#
# Returns the captured String, or nil if url is not an orcid.org URL.
def orcid_from_url(url)
  match = %r{\Ahttps?://orcid\.org/(.+)}.match(url)
  match && match[1]
end
|
279
|
+
|
280
|
+
# Turn an ORCID identifier into an http://orcid.org/ URL.
#
# Returns nil when orcid is not present.
def orcid_as_url(orcid)
  return unless orcid.present?

  "http://orcid.org/#{orcid}"
end
|
283
|
+
|
284
|
+
# Validate the format of an ORCID identifier.
#
# orcid - the identifier, optionally prefixed with "http://orcid.org/" or
#         "https://orcid.org/"
#
# Returns the bare identifier (four groups of four characters separated by
# hyphens, the last character being a digit or "X"), or nil if the input
# does not have that format.
def validate_orcid(orcid)
  # exactly one trailing character: a quantifier there would accept
  # identifiers that are too long
  match = %r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)
  match && match[1]
end
|
287
|
+
|
288
|
+
# parse author string into CSL format
# only assume personal name when using sort-order: "Turing, Alan"
#
# author  - the author name as a String; nil is treated like an empty name
# options - currently unused
#
# Returns { "literal" => "" } for a blank name, { "literal" => name } when
# the name cannot be parsed or is not recognised as a personal name, and
# otherwise a hash with "family" and/or "given".
def get_one_author(author, options = {})
  # to_s guards against a missing name (nil)
  return { "literal" => "" } if author.to_s.strip.blank?

  author = cleanup_author(author)
  names = Namae.parse(author)

  return { "literal" => author } if names.blank? || is_personal_name?(author).blank?

  name = names.first

  { "family" => name.family,
    "given" => name.given }.compact
end
|
305
|
+
|
306
|
+
# Normalise an author string before it is parsed.
#
# Returns a new String with a comma inserted before trailing initials (only
# if the name has no comma yet), the first letter of each word upper-cased
# and every whitespace character replaced by a plain space.
def cleanup_author(author)
  # detect pattern "Smith J.", but not "Smith, John K."
  unless author.include?(",")
    author = author.gsub(/[[:space:]]([A-Z]\.)?(-?[A-Z]\.)$/, ', \1\2')
  end

  # titleize strings
  titleized = author.my_titleize

  # remove non-standard space characters
  titleized.gsub(/[[:space:]]/, ' ')
end
|
315
|
+
|
316
|
+
# Decide whether an author string is a personal name.
#
# A name containing a comma always counts as personal; otherwise the first
# word is looked up as a given name via name_detector.
def is_personal_name?(author)
  author.include?(",") || name_detector.name_exists?(author.split.first)
end
|
322
|
+
|
323
|
+
# parse array of author strings into CSL format
#
# authors - array of author strings (a single value or nil is wrapped)
# options - passed through to get_one_author
#
# Returns an array with the result of get_one_author for each entry.
def get_authors(authors, options={})
  author_list = Array(authors)
  author_list.map { |name| get_one_author(name, options) }
end
|
327
|
+
|
328
|
+
# parse array of author hashes into CSL format
#
# Returns an array with the result of get_one_hashed_author for each entry
# of authors (wrapped with Array()).
def get_hashed_authors(authors)
  author_list = Array(authors)
  author_list.map { |entry| get_one_hashed_author(entry) }
end
|
332
|
+
|
333
|
+
# Convert one author hash into CSL format.
#
# author - hash; "creatorName" is read here, and the whole hash is passed to
#          get_name_identifier
#
# Returns the hash from get_one_author with an "ORCID" entry added; entries
# with nil values are removed.
def get_one_hashed_author(author)
  csl = get_one_author(author.fetch("creatorName", nil))
  csl["ORCID"] = get_name_identifier(author)
  csl.compact
end
|
340
|
+
|
341
|
+
# Return the ORCID URL for an author hash, if it has a valid ORCID.
#
# author - hash; "nameIdentifier" and "nameIdentifierScheme" are read. A
#          missing or nil scheme is treated as "orcid".
#
# Returns "http://orcid.org/<id>" or nil (other scheme or invalid ORCID).
def get_name_identifier(author)
  # `|| "orcid"` also covers a key that is present with a nil value
  scheme = (author.fetch("nameIdentifierScheme", nil) || "orcid").to_s.downcase
  return nil unless scheme == "orcid"

  orcid = validate_orcid(author.fetch("nameIdentifier", nil))
  "http://orcid.org/#{orcid}" if orcid
end
|
350
|
+
|
351
|
+
# GenderDetector instance used by is_personal_name? to look up given names.
#
# Memoized so that a new detector is not built for every lookup.
def name_detector
  @name_detector ||= GenderDetector.new
end
|
354
|
+
end
|
355
|
+
end
|
356
|
+
|
357
|
+
class String
  # Upper-case the first character after every word boundary or underscore,
  # leaving all other characters untouched.
  #
  # Returns a new String.
  def my_titleize
    gsub(/(\b|_)(.)/) do
      lead = Regexp.last_match(1)
      char = Regexp.last_match(2)
      "#{lead}#{char.upcase}"
    end
  end
end
|