uc3-dmp-id 0.1.23 → 0.1.25
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/uc3-dmp-id/comparator.rb +62 -176
- data/lib/uc3-dmp-id/helper.rb +1 -0
- data/lib/uc3-dmp-id/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69c311d2bc8bd7acee827939e1d99c4fdf233ddf5e29386f682ac60f67478a6a
|
4
|
+
data.tar.gz: 829d403c8ada7d01f444494d163bee0fe1c34db320a93bb34abfcc98aebeb122
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8dc439bf6244f758ceb5c1afb5c2d825b58705342b3a146dfcae9af7dcde86352705d703a2aa0e003922f99c61ec2757aaf7c0ea26279c12efdfedbb6cec8b1b
|
7
|
+
data.tar.gz: fd03efac1ba2cacc9be9d40334c835259461a84bb32715e5cbc9723c202d56a52aecdcc8d67ed29a1f050ee0bac853e973955f2211ee41a9da34984a5d13b342
|
@@ -9,22 +9,22 @@ module Uc3DmpId
|
|
9
9
|
# Class that compares incoming data from an external source to the DMP
|
10
10
|
# It determines if they are likely related and applies a confidence rating
|
11
11
|
class Comparator
|
12
|
-
|
12
|
+
MSG_MISSING_DMPS = 'No DMPs were defined. Expected an Array of OpenSearch documents!'
|
13
13
|
|
14
14
|
STOP_WORDS = %w[a an and if of or the then they].freeze
|
15
15
|
|
16
16
|
# See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
|
17
17
|
# Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
|
18
18
|
|
19
|
-
attr_accessor :
|
19
|
+
attr_accessor :dmps, :logger
|
20
20
|
|
21
|
+
# Expecting an Array of OpenSearch documents as :dmps in the :args
|
21
22
|
def initialize(**args)
|
22
23
|
@logger = args[:logger]
|
23
24
|
@details_hash = {}
|
24
25
|
|
25
|
-
@
|
26
|
-
|
27
|
-
raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
|
26
|
+
@dmps = args.fetch(:dmps, [])
|
27
|
+
raise ComparatorError, MSG_MISSING_DMPS if @dmps.empty?
|
28
28
|
end
|
29
29
|
|
30
30
|
# Compare the incoming hash with the DMP details that were gathered during initialization.
|
@@ -50,162 +50,58 @@ module Uc3DmpId
|
|
50
50
|
# }
|
51
51
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
52
52
|
def compare(hash:)
|
53
|
-
|
54
|
-
return response unless hash.is_a?(Hash) && !hash['title'].nil?
|
53
|
+
return [] unless hash.is_a?(Hash) && !hash['title'].nil?
|
55
54
|
|
56
55
|
# Compare the grant ids. If we have a match return the response immediately since that is
|
57
56
|
# a very positive match!
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
private
|
84
|
-
|
85
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
86
|
-
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
87
|
-
def _extract_dmp_details(dmp:)
|
88
|
-
return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
|
89
|
-
|
90
|
-
projects = dmp.fetch('project', [{}])
|
91
|
-
fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
|
92
|
-
hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
|
93
|
-
people = [dmp['contact']]
|
94
|
-
people << dmp.fetch('contributor', [])
|
95
|
-
|
96
|
-
# Extract all of the important bits about the DMP
|
97
|
-
@details_hash = {
|
98
|
-
created: dmp.fetch('created', Time.now.iso8601),
|
99
|
-
title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
|
100
|
-
abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
|
101
|
-
keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
|
102
|
-
identifiers: [dmp.fetch('dmp_id', {})['identifier']],
|
103
|
-
last_names: [],
|
104
|
-
orcids: [],
|
105
|
-
affiliation_ids: [],
|
106
|
-
affiliations: [],
|
107
|
-
funder_names: [],
|
108
|
-
funder_ids: [],
|
109
|
-
opportunity_ids: [],
|
110
|
-
grant_ids: [],
|
111
|
-
repositories: []
|
112
|
-
}
|
113
|
-
_extract_people(array: people&.flatten&.compact&.uniq)
|
114
|
-
_extract_funding(array: fundings)
|
115
|
-
_extract_repositories(repos: hosts.flatten.compact.uniq)
|
116
|
-
|
117
|
-
# Clean up the results by flattening and removing duplicates from the Arrays
|
118
|
-
@details_hash.each_key do |key|
|
119
|
-
@details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
|
57
|
+
scoring = @dmps.map do |entry|
|
58
|
+
dmp = entry.fetch('_source', {})
|
59
|
+
response = { dmp_id: dmp['_id'], confidence: 'None', score: 0, notes: [] }
|
60
|
+
response = _grants_match?(array: hash['fundings'], dmp:, response:)
|
61
|
+
return response if response[:confidence] != 'None'
|
62
|
+
|
63
|
+
response = _opportunities_match?(array: hash['fundings'], dmp:, response:)
|
64
|
+
response = _orcids_match?(array: hash['people'], dmp:, response:)
|
65
|
+
response = _last_name_and_affiliation_match?(array: hash['people'], dmp:, response:)
|
66
|
+
|
67
|
+
# Only process the following if we had some matching contributors, affiliations or opportunity numbers
|
68
|
+
response = _repository_match?(array: hash['repositories'], dmp:, response:) if response[:score].positive?
|
69
|
+
# response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
|
70
|
+
response = _text_match?(type: 'title', text: hash['title'], dmp:, response:) if response[:score].positive?
|
71
|
+
response = _text_match?(type: 'abstract', text: hash['abstract'], dmp:, response:) if response[:score].positive?
|
72
|
+
# If the score is less than 3 then we have no confidence that it is a match
|
73
|
+
return nil if response[:score] <= 2
|
74
|
+
|
75
|
+
# Set the confidence level based on the score
|
76
|
+
response[:confidence] = if response[:score] > 10
|
77
|
+
'High'
|
78
|
+
else
|
79
|
+
(response[:score] > 5 ? 'Medium' : 'Low')
|
80
|
+
end
|
81
|
+
response
|
120
82
|
end
|
121
|
-
@logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
|
122
|
-
end
|
123
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
124
|
-
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
125
83
|
|
126
|
-
|
127
|
-
|
128
|
-
def _extract_funding(array:)
|
129
|
-
return [] unless array.is_a?(Array)
|
130
|
-
|
131
|
-
array.each do |funding|
|
132
|
-
next unless funding.is_a?(Hash)
|
133
|
-
|
134
|
-
funder_id = funding.fetch('funder_id', {})
|
135
|
-
ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
|
136
|
-
fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
|
137
|
-
opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
|
138
|
-
grant = funding.fetch('grant_id', {})['identifier']
|
139
|
-
|
140
|
-
@details_hash[:identifiers] << ror&.downcase&.strip
|
141
|
-
@details_hash[:identifiers] << fundref&.downcase&.strip
|
142
|
-
@details_hash[:identifiers] << grant&.downcase&.strip
|
143
|
-
@details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
|
144
|
-
@details_hash[:identifiers] << opportunity&.downcase&.strip
|
145
|
-
|
146
|
-
@details_hash[:funder_names] << funding['name']&.downcase&.split(' (').first&.strip
|
147
|
-
@details_hash[:funder_ids] << fundref
|
148
|
-
@details_hash[:opportunity_ids] << opportunity&.downcase&.strip
|
149
|
-
@details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
|
150
|
-
end
|
151
|
-
array
|
84
|
+
# TODO: introduce a tie-breaker here (maybe the closest to the project_end date)
|
85
|
+
scoring.compact.sort { |a, b| b[:score] <=> a[:score] }&.first
|
152
86
|
end
|
153
87
|
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
154
88
|
|
155
|
-
|
156
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
157
|
-
def _extract_people(array:)
|
158
|
-
return [] unless array.is_a?(Array)
|
159
|
-
|
160
|
-
array.each do |entry|
|
161
|
-
next unless entry.is_a?(Hash)
|
162
|
-
|
163
|
-
id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
|
164
|
-
affil = entry.fetch('dmproadmap_affiliation', {})
|
165
|
-
ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
|
166
|
-
name = entry.fetch('name', '')&.downcase&.strip
|
167
|
-
last_name = name.include?(', ') ? name.split(', ').first : name.split.last
|
168
|
-
|
169
|
-
@details_hash[:orcids] << id unless id.nil?
|
170
|
-
@details_hash[:identifiers] << [id, ror&.downcase&.strip]
|
171
|
-
@details_hash[:last_names] << last_name
|
172
|
-
@details_hash[:affiliation_ids] << ror
|
173
|
-
@details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
|
174
|
-
end
|
175
|
-
array
|
176
|
-
end
|
177
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
178
|
-
|
179
|
-
# Extract all of the re3data ids, URLs and names
|
180
|
-
# rubocop:disable Metrics/AbcSize
|
181
|
-
def _extract_repositories(repos:)
|
182
|
-
return [] unless repos.is_a?(Array)
|
183
|
-
|
184
|
-
repos.each do |repo|
|
185
|
-
next unless repo.is_a?(Hash)
|
186
|
-
|
187
|
-
@details_hash[:identifiers] << [
|
188
|
-
repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
|
189
|
-
]
|
190
|
-
@details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
|
191
|
-
end
|
192
|
-
repos
|
193
|
-
end
|
194
|
-
# rubocop:enable Metrics/AbcSize
|
89
|
+
private
|
195
90
|
|
196
91
|
# Returns whether or not the incoming grant id(s) match the DMP's grant id(s). Expecting:
|
197
92
|
# [
|
198
93
|
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
199
94
|
# ]
|
200
95
|
# rubocop:disable Metrics/AbcSize
|
201
|
-
def _grants_match?(array:, response:)
|
202
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
96
|
+
def _grants_match?(array:, dmp:, response:)
|
97
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
98
|
+
return response unless dmp['grant_ids'].is_a?(Array) && !dmp['grant_ids'].empty?
|
203
99
|
|
204
100
|
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
205
101
|
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
206
102
|
.flatten.compact.uniq
|
207
103
|
|
208
|
-
matched = _compare_arrays(array_a:
|
104
|
+
matched = _compare_arrays(array_a: dmp['grant_ids'], array_b: ids)
|
209
105
|
return response if matched <= 0
|
210
106
|
|
211
107
|
response[:confidence] = 'Absolute'
|
@@ -220,14 +116,15 @@ module Uc3DmpId
|
|
220
116
|
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
221
117
|
# ]
|
222
118
|
# rubocop:disable Metrics/AbcSize
|
223
|
-
def _opportunities_match?(array:, response:)
|
224
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
119
|
+
def _opportunities_match?(array:, dmp:, response:)
|
120
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
121
|
+
return response unless dmp['funder_opportunity_ids'].is_a?(Array) && !dmp['funder_opportunity_ids'].empty?
|
225
122
|
|
226
123
|
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
227
124
|
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
228
125
|
.flatten.compact.uniq
|
229
126
|
|
230
|
-
matched = _compare_arrays(array_a:
|
127
|
+
matched = _compare_arrays(array_a: dmp['funder_opportunity_ids'], array_b: ids)
|
231
128
|
return response if matched <= 0
|
232
129
|
|
233
130
|
response[:score] += 5
|
@@ -245,14 +142,15 @@ module Uc3DmpId
|
|
245
142
|
# }
|
246
143
|
# ]
|
247
144
|
# rubocop:disable Metrics/AbcSize
|
248
|
-
def _orcids_match?(array:, response:)
|
249
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
145
|
+
def _orcids_match?(array:, dmp:, response:)
|
146
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
147
|
+
return response unless dmp['people_ids'].is_a?(Array) && !dmp['people_ids'].empty?
|
250
148
|
|
251
149
|
ids = array.select { |repo| repo.is_a?(Hash) }
|
252
150
|
.map { |person| person['id']&.downcase&.strip }
|
253
151
|
.flatten.compact.uniq
|
254
152
|
|
255
|
-
matched = _compare_arrays(array_a:
|
153
|
+
matched = _compare_arrays(array_a: dmp['people_ids'], array_b: ids)
|
256
154
|
return response if matched <= 0
|
257
155
|
|
258
156
|
response[:score] += (matched * 2)
|
@@ -270,8 +168,9 @@ module Uc3DmpId
|
|
270
168
|
# }
|
271
169
|
# ]
|
272
170
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
273
|
-
def _last_name_and_affiliation_match?(array:, response:)
|
274
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
171
|
+
def _last_name_and_affiliation_match?(array:, dmp:, response:)
|
172
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
173
|
+
return response unless dmp['people'].is_a?(Array) && !dmp['people'].empty?
|
275
174
|
|
276
175
|
array = array.select { |repo| repo.is_a?(Hash) }
|
277
176
|
affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
|
@@ -280,10 +179,10 @@ module Uc3DmpId
|
|
280
179
|
affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
|
281
180
|
|
282
181
|
# Check the person last names and affiliation name and RORs
|
283
|
-
last_names_matched = _compare_arrays(array_a:
|
284
|
-
rors_matched = _compare_arrays(array_a:
|
285
|
-
affil_names_matched = _compare_arrays(array_a:
|
286
|
-
return response if last_names_matched <= 0
|
182
|
+
last_names_matched = _compare_arrays(array_a: dmp['people'], array_b: last_names)
|
183
|
+
rors_matched = _compare_arrays(array_a: dmp.fetch('affiliation_ids', []), array_b: rors)
|
184
|
+
affil_names_matched = _compare_arrays(array_a: dmp.fetch('affiliations', []), array_b: affil_names)
|
185
|
+
return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
|
287
186
|
|
288
187
|
response[:score] += last_names_matched + rors_matched + affil_names_matched
|
289
188
|
response[:notes] << 'contributor names and affiliations matched'
|
@@ -296,15 +195,16 @@ module Uc3DmpId
|
|
296
195
|
# { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
|
297
196
|
# ]
|
298
197
|
# rubocop:disable Metrics/AbcSize
|
299
|
-
def _repository_match?(array:, response:)
|
300
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
198
|
+
def _repository_match?(array:, dmp:, response:)
|
199
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
200
|
+
return response unless dmp['repositories'].is_a?(Array) && !dmp['repositories'].empty?
|
301
201
|
|
302
202
|
# We only care about repositories with ids/urls
|
303
203
|
ids = array.select { |repo| repo.is_a?(Hash) }
|
304
204
|
.map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
|
305
205
|
.flatten.compact.uniq
|
306
206
|
|
307
|
-
matched = _compare_arrays(array_a:
|
207
|
+
matched = _compare_arrays(array_a: dmp['repositories'], array_b: ids)
|
308
208
|
return response if matched <= 0
|
309
209
|
|
310
210
|
response[:score] += matched
|
@@ -313,33 +213,19 @@ module Uc3DmpId
|
|
313
213
|
end
|
314
214
|
# rubocop:enable Metrics/AbcSize
|
315
215
|
|
316
|
-
# Returns whether or not the list of keywords exist in the DMP. Expecting:
|
317
|
-
# keywords: ["foo", "bar"]
|
318
|
-
def _keyword_match?(array:, response:)
|
319
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
320
|
-
|
321
|
-
keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
|
322
|
-
matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
|
323
|
-
return response if matched <= 0
|
324
|
-
|
325
|
-
response[:score] += 1
|
326
|
-
response[:notes] << 'keywords matched'
|
327
|
-
response
|
328
|
-
end
|
329
|
-
|
330
216
|
# Uses an NLP library to determine if the :text matches the DMP/Project :title or :description
|
331
217
|
# rubocop:disable Metrics/AbcSize
|
332
|
-
def _text_match?(text:, response:, type: 'title')
|
333
|
-
return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
|
334
|
-
!@details_hash[type.to_sym].nil?
|
218
|
+
def _text_match?(text:, dmp:, response:, type: 'title')
|
219
|
+
return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? && dmp.is_a?(Hash)
|
335
220
|
|
336
221
|
nlp_processor = Text::WhiteSimilarity.new
|
337
222
|
cleansed = _cleanse_text(text:)
|
338
223
|
|
224
|
+
dmp_val = type == 'title' ? dmp['title'] : dmp['description']
|
339
225
|
details = {
|
340
|
-
"dmp_#{type}":
|
226
|
+
"dmp_#{type}": dmp_val,
|
341
227
|
"incoming_#{type}": cleansed,
|
342
|
-
nlp_score: nlp_processor.similarity(
|
228
|
+
nlp_score: nlp_processor.similarity(dmp_val, cleansed)
|
343
229
|
}
|
344
230
|
@logger&.debug(message: 'Text::WhiteSimilarity score', details:)
|
345
231
|
return response if details[:nlp_score] < 0.5
|
data/lib/uc3-dmp-id/helper.rb
CHANGED
@@ -206,6 +206,7 @@ module Uc3DmpId
|
|
206
206
|
annotated['dmphub_modification_day'] = Time.now.utc.strftime('%Y-%m-%d')
|
207
207
|
annotated['dmphub_owner_id'] = owner_id unless owner_id.nil?
|
208
208
|
annotated['dmphub_owner_org'] = owner_org unless owner_org.nil?
|
209
|
+
annotated['registered'] = annotated['modified'] if annotated['registered'].nil?
|
209
210
|
return annotated unless json['dmphub_provenance_id'].nil?
|
210
211
|
|
211
212
|
annotated['dmphub_provenance_id'] = provenance.fetch('PK', '')
|
data/lib/uc3-dmp-id/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: uc3-dmp-id
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Riley
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-03-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|