uc3-dmp-id 0.1.24 → 0.1.25

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 91c01361a882520a281db0da8aa029bcb480b4873b684a4a526f2542abda9ea0
4
- data.tar.gz: b9ca235a0c706469b4568e74a6ecbec2b39e0b44f9298d781ea11a2dd9cde749
3
+ metadata.gz: 69c311d2bc8bd7acee827939e1d99c4fdf233ddf5e29386f682ac60f67478a6a
4
+ data.tar.gz: 829d403c8ada7d01f444494d163bee0fe1c34db320a93bb34abfcc98aebeb122
5
5
  SHA512:
6
- metadata.gz: d4a5092b448ce3f1a1ad71ed53c5d81304780732a5766d35a2875d0ed244d07b660d25f156a8facd1c9a794d89e59f17915bf640692ba1266556892245b5cab9
7
- data.tar.gz: 4e3d0e476d2479f075abd8bc0cb45028766a766e60a1fae4441a1ef68c451e58757c5cd33c4b3033ccce5784c23112b1f9cb9782b49fd17c8dd2b80f538a4d24
6
+ metadata.gz: 8dc439bf6244f758ceb5c1afb5c2d825b58705342b3a146dfcae9af7dcde86352705d703a2aa0e003922f99c61ec2757aaf7c0ea26279c12efdfedbb6cec8b1b
7
+ data.tar.gz: fd03efac1ba2cacc9be9d40334c835259461a84bb32715e5cbc9723c202d56a52aecdcc8d67ed29a1f050ee0bac853e973955f2211ee41a9da34984a5d13b342
@@ -9,22 +9,22 @@ module Uc3DmpId
9
9
  # Class that compares incoming data from an external source to the DMP
10
10
  # It determines if they are likely related and applies a confidence rating
11
11
  class Comparator
12
- MSG_MISSING_DMP = 'No DMP or the DMP did not contain enough information to use.'
12
+ MSG_MISSING_DMPS = 'No DMPs were defined. Expected an Array of OpenSearch documents!'
13
13
 
14
14
  STOP_WORDS = %w[a an and if of or the then they].freeze
15
15
 
16
16
  # See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
17
17
  # Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
18
18
 
19
- attr_accessor :dmp, :details_hash, :logger
19
+ attr_accessor :dmps, :logger
20
20
 
21
+ # Expecting an Array of OpenSearch documents as :dmps in the :args
21
22
  def initialize(**args)
22
23
  @logger = args[:logger]
23
24
  @details_hash = {}
24
25
 
25
- @dmp = args.fetch(:dmp, {})['dmp'].nil? ? args[:dmp] : args.fetch(:dmp, {})['dmp']
26
- _extract_dmp_details(dmp:)
27
- raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
26
+ @dmps = args.fetch(:dmps, [])
27
+ raise ComparatorError, MSG_MISSING_DMPS if @dmps.empty?
28
28
  end
29
29
 
30
30
  # Compare the incoming hash with the DMP details that were gathered during initialization.
@@ -50,162 +50,58 @@ module Uc3DmpId
50
50
  # }
51
51
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
52
52
  def compare(hash:)
53
- response = { confidence: 'None', score: 0, notes: [] }
54
- return response unless hash.is_a?(Hash) && !hash['title'].nil?
53
+ return [] unless hash.is_a?(Hash) && !hash['title'].nil?
55
54
 
56
55
  # Compare the grant ids. If we have a match return the response immediately since that is
57
56
  # a very positive match!
58
- response = _grants_match?(array: hash['fundings'], response:)
59
- return response if response[:confidence] != 'None'
60
-
61
- response = _opportunities_match?(array: hash['fundings'], response:)
62
- response = _orcids_match?(array: hash['people'], response:)
63
- response = _last_name_and_affiliation_match?(array: hash['people'], response:)
64
-
65
- # Only process the following if we had some matching contributors, affiliations or opportunity nbrs
66
- response = _repository_match?(array: hash['repositories'], response:) if response[:score].positive?
67
- response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
68
- response = _text_match?(type: 'title', text: hash['title'], response:) if response[:score].positive?
69
- response = _text_match?(type: 'abstract', text: hash['abstract'], response:) if response[:score].positive?
70
- # If the score is less than 3 then we have no confidence that it is a match
71
- return response if response[:score] <= 2
72
-
73
- # Set the confidence level based on the score
74
- response[:confidence] = if response[:score] > 10
75
- 'High'
76
- else
77
- (response[:score] > 5 ? 'Medium' : 'Low')
78
- end
79
- response
80
- end
81
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
82
-
83
- private
84
-
85
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
86
- # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
87
- def _extract_dmp_details(dmp:)
88
- return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
89
-
90
- projects = dmp.fetch('project', [{}])
91
- fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
92
- hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
93
- people = [dmp['contact']]
94
- people << dmp.fetch('contributor', [])
95
-
96
- # Extract all of the important bits about the DMP
97
- @details_hash = {
98
- created: dmp.fetch('created', Time.now.iso8601),
99
- title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
100
- abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
101
- keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
102
- identifiers: [dmp.fetch('dmp_id', {})['identifier']],
103
- last_names: [],
104
- orcids: [],
105
- affiliation_ids: [],
106
- affiliations: [],
107
- funder_names: [],
108
- funder_ids: [],
109
- opportunity_ids: [],
110
- grant_ids: [],
111
- repositories: []
112
- }
113
- _extract_people(array: people&.flatten&.compact&.uniq)
114
- _extract_funding(array: fundings)
115
- _extract_repositories(repos: hosts.flatten.compact.uniq)
116
-
117
- # Clean up the results by flattening and removing duplicates from the Arrays
118
- @details_hash.each_key do |key|
119
- @details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
57
+ scoring = @dmps.map do |entry|
58
+ dmp = entry.fetch('_source', {})
59
+ response = { dmp_id: dmp['_id'], confidence: 'None', score: 0, notes: [] }
60
+ response = _grants_match?(array: hash['fundings'], dmp:, response:)
61
+ return response if response[:confidence] != 'None'
62
+
63
+ response = _opportunities_match?(array: hash['fundings'], dmp:, response:)
64
+ response = _orcids_match?(array: hash['people'], dmp:, response:)
65
+ response = _last_name_and_affiliation_match?(array: hash['people'], dmp:, response:)
66
+
67
+ # Only process the following if we had some matching contributors, affiliations or opportunity nbrs
68
+ response = _repository_match?(array: hash['repositories'], dmp:, response:) if response[:score].positive?
69
+ # response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
70
+ response = _text_match?(type: 'title', text: hash['title'], dmp:, response:) if response[:score].positive?
71
+ response = _text_match?(type: 'abstract', text: hash['abstract'], dmp:, response:) if response[:score].positive?
72
+ # If the score is less than 3 then we have no confidence that it is a match
73
+ return nil if response[:score] <= 2
74
+
75
+ # Set the confidence level based on the score
76
+ response[:confidence] = if response[:score] > 10
77
+ 'High'
78
+ else
79
+ (response[:score] > 5 ? 'Medium' : 'Low')
80
+ end
81
+ response
120
82
  end
121
- @logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
122
- end
123
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
124
- # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
125
83
 
126
- # Extract all of the funding information
127
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
128
- def _extract_funding(array:)
129
- return [] unless array.is_a?(Array)
130
-
131
- array.each do |funding|
132
- next unless funding.is_a?(Hash)
133
-
134
- funder_id = funding.fetch('funder_id', {})
135
- ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
136
- fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
137
- opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
138
- grant = funding.fetch('grant_id', {})['identifier']
139
-
140
- @details_hash[:identifiers] << ror&.downcase&.strip
141
- @details_hash[:identifiers] << fundref&.downcase&.strip
142
- @details_hash[:identifiers] << grant&.downcase&.strip
143
- @details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
144
- @details_hash[:identifiers] << opportunity&.downcase&.strip
145
-
146
- @details_hash[:funder_names] << funding['name']&.downcase&.split(' (').first&.strip
147
- @details_hash[:funder_ids] << fundref
148
- @details_hash[:opportunity_ids] << opportunity&.downcase&.strip
149
- @details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
150
- end
151
- array
84
+ # TODO: introduce a tie-breaker here (maybe the closest to the project_end date)
85
+ scoring.compact.sort { |a, b| b[:score] <=> a[:score] }&.first
152
86
  end
153
87
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
154
88
 
155
- # Extract all of the ORCIDs, last names, and affiliation ids and names
156
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
157
- def _extract_people(array:)
158
- return [] unless array.is_a?(Array)
159
-
160
- array.each do |entry|
161
- next unless entry.is_a?(Hash)
162
-
163
- id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
164
- affil = entry.fetch('dmproadmap_affiliation', {})
165
- ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
166
- name = entry.fetch('name', '')&.downcase&.strip
167
- last_name = name.include?(', ') ? name.split(', ').first : name.split.last
168
-
169
- @details_hash[:orcids] << id unless id.nil?
170
- @details_hash[:identifiers] << [id, ror&.downcase&.strip]
171
- @details_hash[:last_names] << last_name
172
- @details_hash[:affiliation_ids] << ror
173
- @details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
174
- end
175
- array
176
- end
177
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
178
-
179
- # Extract all of the re3data ids, URLs and names
180
- # rubocop:disable Metrics/AbcSize
181
- def _extract_repositories(repos:)
182
- return [] unless repos.is_a?(Array)
183
-
184
- repos.each do |repo|
185
- next unless repo.is_a?(Hash)
186
-
187
- @details_hash[:identifiers] << [
188
- repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
189
- ]
190
- @details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
191
- end
192
- repos
193
- end
194
- # rubocop:enable Metrics/AbcSize
89
+ private
195
90
 
196
91
  # Returns whether or not the incoming grant id(s) match the DMPs grant id. Expecting:
197
92
  # [
198
93
  # { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
199
94
  # ]
200
95
  # rubocop:disable Metrics/AbcSize
201
- def _grants_match?(array:, response:)
202
- return response unless array.is_a?(Array) && response.is_a?(Hash)
96
+ def _grants_match?(array:, dmp:, response:)
97
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
98
+ return response unless dmp['grant_ids'].is_a?(Array) && !dmp['grant_ids'].empty?
203
99
 
204
100
  ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
205
101
  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
206
102
  .flatten.compact.uniq
207
103
 
208
- matched = _compare_arrays(array_a: @details_hash.fetch(:grant_ids, []), array_b: ids)
104
+ matched = _compare_arrays(array_a: dmp['grant_ids'], array_b: ids)
209
105
  return response if matched <= 0
210
106
 
211
107
  response[:confidence] = 'Absolute'
@@ -220,14 +116,15 @@ module Uc3DmpId
220
116
  # { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
221
117
  # ]
222
118
  # rubocop:disable Metrics/AbcSize
223
- def _opportunities_match?(array:, response:)
224
- return response unless array.is_a?(Array) && response.is_a?(Hash)
119
+ def _opportunities_match?(array:, dmp:, response:)
120
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
121
+ return response unless dmp['funder_opportunity_ids'].is_a?(Array) && !dmp['funder_opportunity_ids'].empty?
225
122
 
226
123
  ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
227
124
  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
228
125
  .flatten.compact.uniq
229
126
 
230
- matched = _compare_arrays(array_a: @details_hash.fetch(:opportunity_ids, []), array_b: ids)
127
+ matched = _compare_arrays(array_a: dmp['funder_opportunity_ids'], array_b: ids)
231
128
  return response if matched <= 0
232
129
 
233
130
  response[:score] += 5
@@ -245,14 +142,15 @@ module Uc3DmpId
245
142
  # }
246
143
  # ]
247
144
  # rubocop:disable Metrics/AbcSize
248
- def _orcids_match?(array:, response:)
249
- return response unless array.is_a?(Array) && response.is_a?(Hash)
145
+ def _orcids_match?(array:, dmp:, response:)
146
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
147
+ return response unless dmp['people_ids'].is_a?(Array) && !dmp['people_ids'].empty?
250
148
 
251
149
  ids = array.select { |repo| repo.is_a?(Hash) }
252
150
  .map { |person| person['id']&.downcase&.strip }
253
151
  .flatten.compact.uniq
254
152
 
255
- matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
153
+ matched = _compare_arrays(array_a: dmp['people_ids'], array_b: ids)
256
154
  return response if matched <= 0
257
155
 
258
156
  response[:score] += (matched * 2)
@@ -270,8 +168,9 @@ module Uc3DmpId
270
168
  # }
271
169
  # ]
272
170
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
273
- def _last_name_and_affiliation_match?(array:, response:)
274
- return response unless array.is_a?(Array) && response.is_a?(Hash)
171
+ def _last_name_and_affiliation_match?(array:, dmp:, response:)
172
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
173
+ return response unless dmp['people'].is_a?(Array) && !dmp['people'].empty?
275
174
 
276
175
  array = array.select { |repo| repo.is_a?(Hash) }
277
176
  affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
@@ -280,10 +179,10 @@ module Uc3DmpId
280
179
  affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
281
180
 
282
181
  # Check the person last names and affiliation name and RORs
283
- last_names_matched = _compare_arrays(array_a: @details_hash.fetch(:last_names, []), array_b: last_names)
284
- rors_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliation_ids, []), array_b: rors)
285
- affil_names_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliations, []), array_b: affil_names)
286
- return response if last_names_matched <= 0
182
+ last_names_matched = _compare_arrays(array_a: dmp['people'], array_b: last_names)
183
+ rors_matched = _compare_arrays(array_a: dmp.fetch('affiliation_ids', []), array_b: rors)
184
+ affil_names_matched = _compare_arrays(array_a: dmp.fetch('affiliations', []), array_b: affil_names)
185
+ return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
287
186
 
288
187
  response[:score] += last_names_matched + rors_matched + affil_names_matched
289
188
  response[:notes] << 'contributor names and affiliations matched'
@@ -296,15 +195,16 @@ module Uc3DmpId
296
195
  # { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
297
196
  # ]
298
197
  # rubocop:disable Metrics/AbcSize
299
- def _repository_match?(array:, response:)
300
- return response unless array.is_a?(Array) && response.is_a?(Hash)
198
+ def _repository_match?(array:, dmp:, response:)
199
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
200
+ return response unless dmp['repositories'].is_a?(Array) && !dmp['repositories'].empty?
301
201
 
302
202
  # We only care about repositories with ids/urls
303
203
  ids = array.select { |repo| repo.is_a?(Hash) }
304
204
  .map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
305
205
  .flatten.compact.uniq
306
206
 
307
- matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
207
+ matched = _compare_arrays(array_a: dmp['repositories'], array_b: ids)
308
208
  return response if matched <= 0
309
209
 
310
210
  response[:score] += matched
@@ -313,33 +213,19 @@ module Uc3DmpId
313
213
  end
314
214
  # rubocop:enable Metrics/AbcSize
315
215
 
316
- # Returns whether or not the list of keywords exist in the DMP. Expecting:
317
- # keywords: ["foo", "bar"]
318
- def _keyword_match?(array:, response:)
319
- return response unless array.is_a?(Array) && response.is_a?(Hash)
320
-
321
- keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
322
- matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
323
- return response if matched <= 0
324
-
325
- response[:score] += 1
326
- response[:notes] << 'keywords matched'
327
- response
328
- end
329
-
330
216
  # Uses an NLP library to determine if the :text matches the DMP/Project :title or :description
331
217
  # rubocop:disable Metrics/AbcSize
332
- def _text_match?(text:, response:, type: 'title')
333
- return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
334
- !@details_hash[type.to_sym].nil?
218
+ def _text_match?(text:, dmp:, response:, type: 'title')
219
+ return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? && dmp.is_a?(Hash)
335
220
 
336
221
  nlp_processor = Text::WhiteSimilarity.new
337
222
  cleansed = _cleanse_text(text:)
338
223
 
224
+ dmp_val = type == 'title' ? dmp['title'] : dmp['description']
339
225
  details = {
340
- "dmp_#{type}": @details_hash[type.to_sym],
226
+ "dmp_#{type}": dmp_val,
341
227
  "incoming_#{type}": cleansed,
342
- nlp_score: nlp_processor.similarity(@details_hash[type.to_sym], cleansed)
228
+ nlp_score: nlp_processor.similarity(dmp_val, cleansed)
343
229
  }
344
230
  @logger&.debug(message: 'Text::WhiteSimilarity score', details:)
345
231
  return response if details[:nlp_score] < 0.5
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Uc3DmpId
4
- VERSION = '0.1.24'
4
+ VERSION = '0.1.25'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: uc3-dmp-id
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Riley
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-03-08 00:00:00.000000000 Z
11
+ date: 2024-03-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json