uc3-dmp-id 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/uc3-dmp-id/asserter.rb +8 -2
- data/lib/uc3-dmp-id/comparator.rb +531 -0
- data/lib/uc3-dmp-id/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10840ba39949ec387dd3016a717d75cf3ba5d259fe164fb54e9c2d436e74a24b
|
4
|
+
data.tar.gz: 398c7de1d549a738bfc574cdf56c71b1bff8400402aee5bc8b356fdcb9f01ecb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 793eeaf24f53e8a77e36596d91c56501f125f8d9272bd6b432dd05bc2d170d907d25fe213c276cf614ed8118884ac0be796cfa53e3d02c88ed91bf8660ab2f53
|
7
|
+
data.tar.gz: 8214ca8f136883170f45b025bc80e73e047ad2a6172837e08ab8f1b3dc48049d3038a95f4df278958c7b3d6a5b5b938a988ec88c3bb556bb0791c998c2603200
|
data/lib/uc3-dmp-id/asserter.rb
CHANGED
@@ -168,7 +168,8 @@ module Uc3DmpId
|
|
168
168
|
# "id": "ABCD1234",
|
169
169
|
# "provenance": "dmphub",
|
170
170
|
# "timestamp": "2023-07-07T14:50:23+00:00",
|
171
|
-
# "note": "
|
171
|
+
# "note": "Data received from OpenAlex, matched by PI names and title keywords.",
|
172
|
+
# "confidence": "Med",
|
172
173
|
# "dmproadmap_related_identifiers": {
|
173
174
|
# "work_type": "article",
|
174
175
|
# "descriptor": "is_cited_by",
|
@@ -183,7 +184,8 @@ module Uc3DmpId
|
|
183
184
|
# "id": "ABCD1234",
|
184
185
|
# "provenance": "dmphub",
|
185
186
|
# "timestamp": "2023-07-07T14:50:23+00:00",
|
186
|
-
# "note": "
|
187
|
+
# "note": "Data received from the NIH API, matched by the opportunity number.",
|
188
|
+
# "confidence": "High",
|
187
189
|
# "funding": {
|
188
190
|
# "funding_status": "granted",
|
189
191
|
# "grant_id": {
|
@@ -206,5 +208,9 @@ module Uc3DmpId
|
|
206
208
|
JSON.parse(assertion.to_json)
|
207
209
|
end
|
208
210
|
end
|
211
|
+
|
212
|
+
def _score_related_work(latest_version:, work:); end
|
213
|
+
|
214
|
+
def _score_funding(latest_version:, funding:); end
|
209
215
|
end
|
210
216
|
end
|
data/lib/uc3-dmp-id/comparator.rb
ADDED
@@ -0,0 +1,531 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'text'
|
4
|
+
|
5
|
+
# rubocop:disable Metrics/ClassLength
|
6
|
+
module Uc3DmpId
|
7
|
+
class ComparatorError < StandardError; end
|
8
|
+
|
9
|
+
# Class that compares incoming data from an external source to the DMP
|
10
|
+
# It determines if they are likely related and applies a confidence rating
|
11
|
+
class Comparator
|
12
|
+
MSG_MISSING_AUGMENTER = 'No Augmenter specified!'
|
13
|
+
MSG_MISSING_DMP = 'No DMP or the DMP did not contain enough information to use.'
|
14
|
+
|
15
|
+
STOP_WORDS = %w[a an and if of or the then they].freeze
|
16
|
+
|
17
|
+
# See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
|
18
|
+
# Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
|
19
|
+
|
20
|
+
attr_accessor :augmenter, :dmp, :details_hash, :logger
|
21
|
+
|
22
|
+
# rubocop:disable Metrics/AbcSize
|
23
|
+
def initialize(**args)
|
24
|
+
@logger = args[:logger]
|
25
|
+
@details_hash = {}
|
26
|
+
|
27
|
+
@augmenter = args[:augmenter]
|
28
|
+
raise ComparatorError, MSG_MISSING_AUGMENTER if @augmenter.nil? ||
|
29
|
+
!@augmenter['PK']&.start_with?('AUGMENTERS#')
|
30
|
+
|
31
|
+
@dmp = args.fetch(:dmp, {})['dmp'].nil? ? args[:dmp] : args.fetch(:dmp, {})['dmp']
|
32
|
+
_extract_dmp_details(dmp:)
|
33
|
+
raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
|
34
|
+
end
|
35
|
+
# rubocop:enable Metrics/AbcSize
|
36
|
+
|
37
|
+
# Compare the incoming hash with the DMP details that were gathered during initialization.
|
38
|
+
#
|
39
|
+
# The Hash should contain:
|
40
|
+
# {
|
41
|
+
# title: "Example research project",
|
42
|
+
# abstract: "Lorem ipsum pseudo abstract",
|
43
|
+
# keywords: ["foo", "bar"],
|
44
|
+
# people: [
|
45
|
+
# {
|
46
|
+
# id: "https://orcid.org/blah",
|
47
|
+
# last_name: "doe",
|
48
|
+
# affiliation: { id: "https://ror.org/blah", name: "Foo" }
|
49
|
+
# }
|
50
|
+
# ],
|
51
|
+
# fundings: [
|
52
|
+
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
53
|
+
# ],
|
54
|
+
# repositories: [
|
55
|
+
# { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
|
56
|
+
# ]
|
57
|
+
# }
|
58
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
59
|
+
def compare(hash:)
|
60
|
+
response = { confidence: 'None', score: 0, notes: [], source: @augmenter['name'] }
|
61
|
+
return response unless hash.is_a?(Hash) && !hash['title'].nil?
|
62
|
+
|
63
|
+
# Compare the grant ids. If we have a match return the response immediately since that is
|
64
|
+
# a very positive match!
|
65
|
+
response = _grants_match?(array: hash['fundings'], response:)
|
66
|
+
return response if response[:confidence] != 'None'
|
67
|
+
|
68
|
+
response = _opportunities_match?(array: hash['fundings'], response:)
|
69
|
+
response = _orcids_match?(array: hash['people'], response:)
|
70
|
+
response = _last_name_and_affiliation_match?(array: hash['people'], response:)
|
71
|
+
|
72
|
+
# Only process the following if we had some matching contributors, affiliations or opportunity numbers
|
73
|
+
response = _repository_match?(array: hash['repositories'], response:) if response[:score].positive?
|
74
|
+
response = _keyword_match?(array: hash['repositories'], response:) if response[:score].positive?
|
75
|
+
response = _text_match?(type: 'title', text: hash['title'], response:) if response[:score].positive?
|
76
|
+
response = _text_match?(type: 'abstract', text: hash['abstract'], response:) if response[:score].positive?
|
77
|
+
# If the score is less than 3 then we have no confidence that it is a match
|
78
|
+
return response if response[:score] <= 2
|
79
|
+
|
80
|
+
# Set the confidence level based on the score
|
81
|
+
response[:confidence] = if response[:score] > 15
|
82
|
+
'High'
|
83
|
+
else
|
84
|
+
(response[:score] > 10 ? 'Medium' : 'Low')
|
85
|
+
end
|
86
|
+
response
|
87
|
+
end
|
88
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
89
|
+
|
90
|
+
private
|
91
|
+
|
92
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
93
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
94
|
+
def _extract_dmp_details(dmp:)
|
95
|
+
return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
|
96
|
+
|
97
|
+
projects = dmp.fetch('project', [{}])
|
98
|
+
fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
|
99
|
+
hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
|
100
|
+
people = [dmp['contact']]
|
101
|
+
people << dmp.fetch('contributor', [])
|
102
|
+
|
103
|
+
# Extract all of the important bits about the DMP
|
104
|
+
@details_hash = {
|
105
|
+
created: dmp.fetch('created', Time.now.iso8601),
|
106
|
+
title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
|
107
|
+
abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
|
108
|
+
keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
|
109
|
+
identifiers: [dmp.fetch('dmp_id', {})['identifier']],
|
110
|
+
last_names: [],
|
111
|
+
affiliation_ids: [],
|
112
|
+
affiliations: [],
|
113
|
+
funder_names: [],
|
114
|
+
funder_ids: [],
|
115
|
+
opportunity_ids: [],
|
116
|
+
grant_ids: [],
|
117
|
+
repositories: []
|
118
|
+
}
|
119
|
+
_extract_people(array: people&.flatten&.compact&.uniq)
|
120
|
+
_extract_funding(array: fundings)
|
121
|
+
_extract_repositories(repos: hosts.flatten.compact.uniq)
|
122
|
+
|
123
|
+
# Clean up the results by flattening and removing duplicates from the Arrays
|
124
|
+
@details_hash.each_key do |key|
|
125
|
+
@details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
|
126
|
+
end
|
127
|
+
@logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
|
128
|
+
end
|
129
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
130
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
131
|
+
|
132
|
+
# Extract all of the funding information
|
133
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
134
|
+
def _extract_funding(array:)
|
135
|
+
return [] unless array.is_a?(Array)
|
136
|
+
|
137
|
+
array.each do |funding|
|
138
|
+
next unless funding.is_a?(Hash)
|
139
|
+
|
140
|
+
funder_id = funding.fetch('funder_id', {})
|
141
|
+
ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
|
142
|
+
fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
|
143
|
+
opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
|
144
|
+
grant = funding.fetch('grant_id', {})['identifier']
|
145
|
+
|
146
|
+
@details_hash[:identifiers] << ror&.downcase&.strip
|
147
|
+
@details_hash[:identifiers] << fundref&.downcase&.strip
|
148
|
+
@details_hash[:identifiers] << grant&.downcase&.strip
|
149
|
+
@details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
|
150
|
+
@details_hash[:identifiers] << opportunity&.downcase&.strip
|
151
|
+
|
152
|
+
@details_hash[:funder_names] << funding['name']&.downcase&.strip
|
153
|
+
@details_hash[:funder_ids] << fundref
|
154
|
+
@details_hash[:opportunity_ids] << opportunity&.downcase&.strip
|
155
|
+
@details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
|
156
|
+
end
|
157
|
+
array
|
158
|
+
end
|
159
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
160
|
+
|
161
|
+
# Extract all of the ORCIDs, last names, and affiliation ids and names
|
162
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
163
|
+
def _extract_people(array:)
|
164
|
+
return [] unless array.is_a?(Array)
|
165
|
+
|
166
|
+
array.each do |entry|
|
167
|
+
next unless entry.is_a?(Hash)
|
168
|
+
|
169
|
+
id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
|
170
|
+
affil = entry.fetch('dmproadmap_affiliation', {})
|
171
|
+
ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
|
172
|
+
name = entry.fetch('name', '')&.downcase&.strip
|
173
|
+
last_name = name.include?(', ') ? name.split(', ').first : name.split.last
|
174
|
+
|
175
|
+
@details_hash[:identifiers] << [id, ror&.downcase&.strip]
|
176
|
+
@details_hash[:last_names] << last_name
|
177
|
+
@details_hash[:affiliation_ids] << ror
|
178
|
+
@details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
|
179
|
+
end
|
180
|
+
array
|
181
|
+
end
|
182
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
183
|
+
|
184
|
+
# Extract all of the re3data ids, URLs and names
|
185
|
+
# rubocop:disable Metrics/AbcSize
|
186
|
+
def _extract_repositories(repos:)
|
187
|
+
return [] unless repos.is_a?(Array)
|
188
|
+
|
189
|
+
repos.each do |repo|
|
190
|
+
next unless repo.is_a?(Hash)
|
191
|
+
|
192
|
+
@details_hash[:identifiers] << [
|
193
|
+
repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
|
194
|
+
]
|
195
|
+
@details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
|
196
|
+
end
|
197
|
+
repos
|
198
|
+
end
|
199
|
+
# rubocop:enable Metrics/AbcSize
|
200
|
+
|
201
|
+
# Returns whether or not the incoming grant id(s) match the DMP's grant id. Expecting:
|
202
|
+
# [
|
203
|
+
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
204
|
+
# ]
|
205
|
+
# rubocop:disable Metrics/AbcSize
|
206
|
+
def _grants_match?(array:, response:)
|
207
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
208
|
+
|
209
|
+
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
210
|
+
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
211
|
+
.flatten.compact.uniq
|
212
|
+
|
213
|
+
matched = _compare_arrays(array_a: @details_hash.fetch(:grant_ids, []), array_b: ids)
|
214
|
+
return response if matched <= 0
|
215
|
+
|
216
|
+
response[:confidence] = 'Absolute'
|
217
|
+
response[:score] = 100
|
218
|
+
response[:notes] << 'the grant ID matched'
|
219
|
+
response
|
220
|
+
end
|
221
|
+
# rubocop:enable Metrics/AbcSize
|
222
|
+
|
223
|
+
# Returns whether or not the incoming grant id(s) match the DMP's opportunity id. Expecting:
|
224
|
+
# [
|
225
|
+
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
226
|
+
# ]
|
227
|
+
# rubocop:disable Metrics/AbcSize
|
228
|
+
def _opportunities_match?(array:, response:)
|
229
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
230
|
+
|
231
|
+
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
232
|
+
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
233
|
+
.flatten.compact.uniq
|
234
|
+
|
235
|
+
matched = _compare_arrays(array_a: @details_hash.fetch(:opportunity_ids, []), array_b: ids)
|
236
|
+
return response if matched <= 0
|
237
|
+
|
238
|
+
response[:score] += 5
|
239
|
+
response[:notes] << 'the funding opportunity number matched'
|
240
|
+
response
|
241
|
+
end
|
242
|
+
# rubocop:enable Metrics/AbcSize
|
243
|
+
|
244
|
+
# Returns whether or not the ids (ORCIDs) of the incoming creators/contributors match identifiers on the DMP. Expecting:
|
245
|
+
# [
|
246
|
+
# {
|
247
|
+
# id: "https://orcid.org/blah",
|
248
|
+
# last_name: "doe",
|
249
|
+
# affiliation: { id: "https://ror.org/blah", name: "Foo" }
|
250
|
+
# }
|
251
|
+
# ]
|
252
|
+
# rubocop:disable Metrics/AbcSize
|
253
|
+
def _orcids_match?(array:, response:)
|
254
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
255
|
+
|
256
|
+
ids = array.select { |repo| repo.is_a?(Hash) }
|
257
|
+
.map { |person| person['id']&.downcase&.strip }
|
258
|
+
.flatten.compact.uniq
|
259
|
+
|
260
|
+
matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
|
261
|
+
return response if matched <= 0
|
262
|
+
|
263
|
+
response[:score] += (matched * 2)
|
264
|
+
response[:notes] << 'contributor ORCIDs matched'
|
265
|
+
response
|
266
|
+
end
|
267
|
+
# rubocop:enable Metrics/AbcSize
|
268
|
+
|
269
|
+
# Returns whether or not the last names and affiliations of the incoming creators/contributors match those on the DMP. Expecting:
|
270
|
+
# [
|
271
|
+
# {
|
272
|
+
# id: "https://orcid.org/blah",
|
273
|
+
# last_name: "doe",
|
274
|
+
# affiliation: { id: "https://ror.org/blah", name: "Foo" }
|
275
|
+
# }
|
276
|
+
# ]
|
277
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
278
|
+
def _last_name_and_affiliation_match?(array:, response:)
|
279
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
280
|
+
|
281
|
+
array = array.select { |repo| repo.is_a?(Hash) }
|
282
|
+
affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
|
283
|
+
last_names = array.map { |person| person['last_name']&.downcase&.strip }&.flatten&.compact&.uniq
|
284
|
+
rors = affiliations.map { |affil| affil['id']&.downcase&.strip }&.flatten&.compact&.uniq
|
285
|
+
affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
|
286
|
+
|
287
|
+
# Check the person last names and affiliation name and RORs
|
288
|
+
last_names_matched = _compare_arrays(array_a: @details_hash.fetch(:last_names, []), array_b: last_names)
|
289
|
+
rors_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliation_ids, []), array_b: rors)
|
290
|
+
affil_names_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliations, []), array_b: affil_names)
|
291
|
+
return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
|
292
|
+
|
293
|
+
response[:score] += last_names_matched + rors_matched + affil_names_matched
|
294
|
+
response[:notes] << 'contributor names and affiliations matched'
|
295
|
+
response
|
296
|
+
end
|
297
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
298
|
+
|
299
|
+
# Returns whether or not the incoming list of repositories match those defined in the DMP. Expecting:
|
300
|
+
# [
|
301
|
+
# { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
|
302
|
+
# ]
|
303
|
+
# rubocop:disable Metrics/AbcSize
|
304
|
+
def _repository_match?(array:, response:)
|
305
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
306
|
+
|
307
|
+
# We only care about repositories with ids/urls
|
308
|
+
ids = array.select { |repo| repo.is_a?(Hash) }
|
309
|
+
.map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
|
310
|
+
.flatten.compact.uniq
|
311
|
+
|
312
|
+
matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
|
313
|
+
return response if matched <= 0
|
314
|
+
|
315
|
+
response[:score] += matched
|
316
|
+
response[:notes] << 'repositories matched'
|
317
|
+
response
|
318
|
+
end
|
319
|
+
# rubocop:enable Metrics/AbcSize
|
320
|
+
|
321
|
+
# Returns whether or not the list of keywords exist in the DMP. Expecting:
|
322
|
+
# keywords: ["foo", "bar"]
|
323
|
+
def _keyword_match?(array:, response:)
|
324
|
+
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
325
|
+
|
326
|
+
keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
|
327
|
+
matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
|
328
|
+
return response if matched <= 0
|
329
|
+
|
330
|
+
response[:score] += 1
|
331
|
+
response[:notes] << 'keywords matched'
|
332
|
+
response
|
333
|
+
end
|
334
|
+
|
335
|
+
# Uses an NLP library to determine if the :text matches the DMP/Project :title or :abstract
|
336
|
+
# rubocop:disable Metrics/AbcSize
|
337
|
+
def _text_match?(text:, response:, type: 'title')
|
338
|
+
return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
|
339
|
+
!@details_hash[type.to_sym].nil?
|
340
|
+
|
341
|
+
nlp_processor = Text::WhiteSimilarity.new
|
342
|
+
cleansed = _cleanse_text(text:)
|
343
|
+
|
344
|
+
details = {
|
345
|
+
"dmp_#{type}": @details_hash[type.to_sym],
|
346
|
+
"incoming_#{type}": cleansed,
|
347
|
+
nlp_score: nlp_processor.similarity(@details_hash[type.to_sym], cleansed)
|
348
|
+
}
|
349
|
+
@logger&.debug(message: 'Text::WhiteSimilarity score', details:)
|
350
|
+
return response if details[:nlp_score] < 0.5
|
351
|
+
|
352
|
+
response[:score] += details[:nlp_score] >= 0.75 ? 5 : 2
|
353
|
+
response[:notes] << "#{type}s are similar"
|
354
|
+
response
|
355
|
+
end
|
356
|
+
# rubocop:enable Metrics/AbcSize
|
357
|
+
|
358
|
+
# Change the incoming text to lower case, collapse runs of whitespace and remove STOP_WORDS
|
359
|
+
def _cleanse_text(text:)
|
360
|
+
return nil unless text.is_a?(String)
|
361
|
+
|
362
|
+
text.downcase.split.reject { |word| STOP_WORDS.include?(word) }.join(' ').strip
|
363
|
+
end
|
364
|
+
|
365
|
+
# Do an intersection of the 2 arrays and return the number of matches
|
366
|
+
def _compare_arrays(array_a: [], array_b: [])
|
367
|
+
return 0 unless array_a.is_a?(Array) && array_b.is_a?(Array)
|
368
|
+
|
369
|
+
intersection = array_a & array_b
|
370
|
+
intersection.nil? || intersection.size <= 0 ? 0 : intersection.size
|
371
|
+
end
|
372
|
+
|
373
|
+
# TODO: Remove this hard-coded crosswalk once the community has broader support for using ROR for funder ids
|
374
|
+
ROR_FUNDREF_ID_CROSSWALK = {
|
375
|
+
# NIH ID Crosswalk
|
376
|
+
'https://ror.org/01cwqze88': 'https://doi.org/10.13039/100000002',
|
377
|
+
'https://ror.org/04mhx6838': 'https://doi.org/10.13039/100000055',
|
378
|
+
'https://ror.org/012pb6c26': 'https://doi.org/10.13039/100000050',
|
379
|
+
'https://ror.org/03wkg3b53': 'https://doi.org/10.13039/100000053',
|
380
|
+
'https://ror.org/0060t0j89': 'https://doi.org/10.13039/100000092',
|
381
|
+
'https://ror.org/00372qc85': 'https://doi.org/10.13039/100000070',
|
382
|
+
'https://ror.org/00190t495': 'https://doi.org/10.13039/100008460',
|
383
|
+
'https://ror.org/00j4k1h63': 'https://doi.org/10.13039/100000066',
|
384
|
+
'https://ror.org/01y3zfr79': 'https://doi.org/10.13039/100000056',
|
385
|
+
'https://ror.org/04q48ey07': 'https://doi.org/10.13039/100000057',
|
386
|
+
'https://ror.org/0493hgw16': 'https://doi.org/10.13039/100006545',
|
387
|
+
'https://ror.org/04vfsmv21': 'https://doi.org/10.13039/100000098',
|
388
|
+
'https://ror.org/03jh5a977': 'https://doi.org/10.13039/100000093',
|
389
|
+
'https://ror.org/04xeg9z08': 'https://doi.org/10.13039/100000025',
|
390
|
+
'https://ror.org/01s5ya894': 'https://doi.org/10.13039/100000065',
|
391
|
+
'https://ror.org/02meqm098': 'https://doi.org/10.13039/100000002',
|
392
|
+
'https://ror.org/049v75w11': 'https://doi.org/10.13039/100000049',
|
393
|
+
'https://ror.org/004a2wv92': 'https://doi.org/10.13039/100000072',
|
394
|
+
'https://ror.org/00adh9b73': 'https://doi.org/10.13039/100000062',
|
395
|
+
'https://ror.org/043z4tv69': 'https://doi.org/10.13039/100000060',
|
396
|
+
'https://ror.org/00x19de83': 'https://doi.org/10.13039/100000002',
|
397
|
+
'https://ror.org/02jzrsm59': 'https://doi.org/10.13039/100000027',
|
398
|
+
'https://ror.org/006zn3t30': 'https://doi.org/10.13039/100000069',
|
399
|
+
'https://ror.org/04byxyr05': 'https://doi.org/10.13039/100000071',
|
400
|
+
'https://ror.org/04pw6fb54': 'https://doi.org/10.13039/100006108',
|
401
|
+
'https://ror.org/05aq6yn88': 'https://doi.org/10.13039/100006955',
|
402
|
+
'https://ror.org/02xey9a22': 'https://doi.org/10.13039/100000061',
|
403
|
+
'https://ror.org/00fj8a872': 'https://doi.org/10.13039/100000052',
|
404
|
+
'https://ror.org/01wtjyf13': 'https://doi.org/10.13039/100000063',
|
405
|
+
'https://ror.org/04r5s4b52': 'https://doi.org/10.13039/100005440',
|
406
|
+
'https://ror.org/046zezr58': 'https://doi.org/10.13039/100006085',
|
407
|
+
'https://ror.org/02e3wq066': 'https://doi.org/10.13039/100006086',
|
408
|
+
'https://ror.org/031gy6182': 'https://doi.org/10.13039/100000002',
|
409
|
+
'https://ror.org/054j5yq82': 'https://doi.org/10.13039/100000002',
|
410
|
+
'https://ror.org/02yrzyf97': 'https://doi.org/10.13039/100000002',
|
411
|
+
|
412
|
+
# NSF ID Crosswalk
|
413
|
+
'https://.org/021nxhr62': 'https://doi.org/10.13039/100000001',
|
414
|
+
'https://.org/04aqat463': 'https://doi.org/10.13039/100000001',
|
415
|
+
'https://.org/01rcfpa16': 'https://doi.org/10.13039/100005441',
|
416
|
+
'https://.org/014eweh95': 'https://doi.org/10.13039/100005445',
|
417
|
+
'https://.org/001xhss06': 'https://doi.org/10.13039/100000076',
|
418
|
+
'https://.org/04qn9mx93': 'https://doi.org/10.13039/100000153',
|
419
|
+
'https://.org/03g87he71': 'https://doi.org/10.13039/100000155',
|
420
|
+
'https://.org/01tnvpc68': 'https://doi.org/10.13039/100000156',
|
421
|
+
'https://.org/01rvays47': 'https://doi.org/10.13039/100000154',
|
422
|
+
'https://.org/002jdaq33': 'https://doi.org/10.13039/100000152',
|
423
|
+
'https://.org/025kzpk63': 'https://doi.org/10.13039/100000083',
|
424
|
+
'https://.org/04nh1dc89': 'https://doi.org/10.13039/100007523',
|
425
|
+
'https://.org/01mng8331': 'https://doi.org/10.13039/100000143',
|
426
|
+
'https://.org/02rdzmk74': 'https://doi.org/10.13039/100000144',
|
427
|
+
'https://.org/053a2cp42': 'https://doi.org/10.13039/100000145',
|
428
|
+
'https://.org/014bj5w56': 'https://doi.org/10.13039/100000081',
|
429
|
+
'https://.org/00whkrf32': 'https://doi.org/10.13039/100000082',
|
430
|
+
'https://.org/05s7cqk18': 'https://doi.org/10.13039/100000173',
|
431
|
+
'https://.org/02kd4km72': 'https://doi.org/10.13039/100000172',
|
432
|
+
'https://.org/03mamvh39': 'https://doi.org/10.13039/100000171',
|
433
|
+
'https://.org/00b6sbb32': 'https://doi.org/10.13039/100000084',
|
434
|
+
'https://.org/0471zv972': 'https://doi.org/10.13039/100000146',
|
435
|
+
'https://.org/028yd4c30': 'https://doi.org/10.13039/100000147',
|
436
|
+
'https://.org/01krpsy48': 'https://doi.org/10.13039/100000148',
|
437
|
+
'https://.org/050rnw378': 'https://doi.org/10.13039/100000149',
|
438
|
+
'https://.org/0388pet74': 'https://doi.org/10.13039/100000150',
|
439
|
+
'https://.org/03xyg3m20': 'https://doi.org/10.13039/100000151',
|
440
|
+
'https://.org/05p847d66': 'https://doi.org/10.13039/100000085',
|
441
|
+
'https://.org/037gd6g64': 'https://doi.org/10.13039/100000159',
|
442
|
+
'https://.org/05v01mk25': 'https://doi.org/10.13039/100000160',
|
443
|
+
'https://.org/05wqqhv83': 'https://doi.org/10.13039/100000141',
|
444
|
+
'https://.org/05nwjp114': 'https://doi.org/10.13039/100007352',
|
445
|
+
'https://.org/05fnzca26': 'https://doi.org/10.13039/100000162',
|
446
|
+
'https://.org/02trddg58': 'https://doi.org/10.13039/100000163',
|
447
|
+
'https://.org/029b7h395': 'https://doi.org/10.13039/100000086',
|
448
|
+
'https://.org/04mg8wm74': 'https://doi.org/10.13039/100000164',
|
449
|
+
'https://.org/01ar8dr59': 'https://doi.org/10.13039/100000165',
|
450
|
+
'https://.org/01pc7k308': 'https://doi.org/10.13039/100000078',
|
451
|
+
'https://.org/051fftw81': 'https://doi.org/10.13039/100000121',
|
452
|
+
'https://.org/04ap5x931': 'https://doi.org/10.13039/100000166',
|
453
|
+
'https://.org/00apvva27': 'https://doi.org/10.13039/100005716',
|
454
|
+
'https://.org/04nseet23': 'https://doi.org/10.13039/100000179',
|
455
|
+
'https://.org/04k9mqs78': 'https://doi.org/10.13039/100000106',
|
456
|
+
'https://.org/01k638r21': 'https://doi.org/10.13039/100000089',
|
457
|
+
'https://.org/01gmp5538': 'https://doi.org/10.13039/100005447',
|
458
|
+
'https://.org/01vnjbg30': 'https://doi.org/10.13039/100005449',
|
459
|
+
'https://.org/03h7mcc28': 'https://doi.org/10.13039/100000088',
|
460
|
+
'https://.org/05wgkzg12': 'https://doi.org/10.13039/100000169',
|
461
|
+
'https://.org/0445wmv88': 'https://doi.org/10.13039/100000170',
|
462
|
+
'https://.org/02dz2hb46': 'https://doi.org/10.13039/100000077',
|
463
|
+
'https://.org/034m1ez10': 'https://doi.org/10.13039/100000107',
|
464
|
+
'https://.org/02a65dj82': 'https://doi.org/10.13039/100005717',
|
465
|
+
'https://.org/020fhsn68': 'https://doi.org/10.13039/100000001',
|
466
|
+
'https://.org/03z9hh605': 'https://doi.org/10.13039/100000174',
|
467
|
+
'https://.org/04ya3kq71': 'https://doi.org/10.13039/100007521',
|
468
|
+
'https://.org/04evh7y43': 'https://doi.org/10.13039/100005443',
|
469
|
+
'https://.org/04h67aa53': 'https://doi.org/10.13039/100000177',
|
470
|
+
'https://.org/025dabr11': 'https://doi.org/10.13039/100005446',
|
471
|
+
'https://.org/04vw0kz07': 'https://doi.org/10.13039/100005448',
|
472
|
+
'https://.org/054ydxh33': 'https://doi.org/10.13039/100005554',
|
473
|
+
'https://.org/01sharn77': 'https://doi.org/10.13039/100006091',
|
474
|
+
'https://.org/02ch5q898': 'https://doi.org/10.13039/100000001',
|
475
|
+
|
476
|
+
# NASA ID Crosswalk
|
477
|
+
'https://.org/0171mag52': 'https://doi.org/10.13039/100006198',
|
478
|
+
'https://.org/027k65916': 'https://doi.org/10.13039/100006196',
|
479
|
+
'https://.org/027ka1x80': 'https://doi.org/10.13039/100000104',
|
480
|
+
'https://.org/02acart68': 'https://doi.org/10.13039/100006195',
|
481
|
+
'https://.org/059fqnc42': 'https://doi.org/10.13039/100006193',
|
482
|
+
'https://.org/01cyfxe35': 'https://doi.org/10.13039/100016595',
|
483
|
+
'https://.org/04xx4z452': 'https://doi.org/10.13039/100006203',
|
484
|
+
'https://.org/0399mhs52': 'https://doi.org/10.13039/100006199',
|
485
|
+
'https://.org/02epydz83': 'https://doi.org/10.13039/100006197',
|
486
|
+
'https://.org/03j9e2j92': 'https://doi.org/10.13039/100006205',
|
487
|
+
'https://.org/02s42x260': 'https://doi.org/10.13039/100000104',
|
488
|
+
'https://.org/01p7gwa14': 'https://doi.org/10.13039/100000104',
|
489
|
+
'https://.org/01qxmdg18': 'https://doi.org/10.13039/100000104',
|
490
|
+
'https://.org/006ndaj41': 'https://doi.org/10.13039/100000104',
|
491
|
+
'https://.org/03em45j53': 'https://doi.org/10.13039/100007346',
|
492
|
+
'https://.org/045t78n53': 'https://doi.org/10.13039/100000104',
|
493
|
+
'https://.org/00r57r863': 'https://doi.org/10.13039/100000104',
|
494
|
+
'https://.org/0401vze59': 'https://doi.org/10.13039/100007726',
|
495
|
+
'https://.org/04hccab49': 'https://doi.org/10.13039/100000104',
|
496
|
+
'https://.org/04437j066': 'https://doi.org/10.13039/100000104',
|
497
|
+
'https://.org/028b18z22': 'https://doi.org/10.13039/100000104',
|
498
|
+
'https://.org/00ryjtt64': 'https://doi.org/10.13039/100000104',
|
499
|
+
|
500
|
+
# DOE ID Crosswalk
|
501
|
+
'https://ror.org/01bj3aw27': 'https://doi.org/10.13039/100000015',
|
502
|
+
'https://ror.org/03q1rgc19': 'https://doi.org/10.13039/100006133',
|
503
|
+
'https://ror.org/02xznz413': 'https://doi.org/10.13039/100006134',
|
504
|
+
'https://ror.org/03sk1we31': 'https://doi.org/10.13039/100006168',
|
505
|
+
'https://ror.org/00f93gc02': 'https://doi.org/10.13039/100006177',
|
506
|
+
'https://ror.org/05tj7dm33': 'https://doi.org/10.13039/100006147',
|
507
|
+
'https://ror.org/0012c7r22': 'https://doi.org/10.13039/100006192',
|
508
|
+
'https://ror.org/00mmn6b08': 'https://doi.org/10.13039/100006132',
|
509
|
+
'https://ror.org/03ery9d53': 'https://doi.org/10.13039/100006120',
|
510
|
+
'https://ror.org/033jmdj81': 'https://doi.org/10.13039/100000015',
|
511
|
+
'https://ror.org/03rd4h240': 'https://doi.org/10.13039/100006130',
|
512
|
+
'https://ror.org/0054t4769': 'https://doi.org/10.13039/100006200',
|
513
|
+
'https://ror.org/03eecgp81': 'https://doi.org/10.13039/100006174',
|
514
|
+
'https://ror.org/00heb4d89': 'https://doi.org/10.13039/100006135',
|
515
|
+
'https://ror.org/05ek3m339': 'https://doi.org/10.13039/100006150',
|
516
|
+
'https://ror.org/00km40770': 'https://doi.org/10.13039/100006138',
|
517
|
+
'https://ror.org/02ah1da87': 'https://doi.org/10.13039/100006137',
|
518
|
+
'https://ror.org/05hsv7e61': 'https://doi.org/10.13039/100000015',
|
519
|
+
'https://ror.org/01c9ay627': 'https://doi.org/10.13039/100006165',
|
520
|
+
'https://ror.org/04z2gev20': 'https://doi.org/10.13039/100006183',
|
521
|
+
'https://ror.org/02z1qvq09': 'https://doi.org/10.13039/100006144',
|
522
|
+
'https://ror.org/03jf3w726': 'https://doi.org/10.13039/100006186',
|
523
|
+
'https://ror.org/04848jz84': 'https://doi.org/10.13039/100006142',
|
524
|
+
'https://ror.org/04s778r16': 'https://doi.org/10.13039/100006171',
|
525
|
+
'https://ror.org/04nnxen11': 'https://doi.org/10.13039/100000015',
|
526
|
+
'https://ror.org/05csy5p27': 'https://doi.org/10.13039/100010268',
|
527
|
+
'https://ror.org/05efnac71': 'https://doi.org/10.13039/100000015'
|
528
|
+
}.freeze
|
529
|
+
end
|
530
|
+
end
|
531
|
+
# rubocop:enable Metrics/ClassLength
|
data/lib/uc3-dmp-id/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: uc3-dmp-id
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Riley
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '3.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: text
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: uc3-dmp-dynamo
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -76,6 +90,7 @@ files:
|
|
76
90
|
- README.md
|
77
91
|
- lib/uc3-dmp-id.rb
|
78
92
|
- lib/uc3-dmp-id/asserter.rb
|
93
|
+
- lib/uc3-dmp-id/comparator.rb
|
79
94
|
- lib/uc3-dmp-id/creator.rb
|
80
95
|
- lib/uc3-dmp-id/deleter.rb
|
81
96
|
- lib/uc3-dmp-id/finder.rb
|