uc3-dmp-id 0.1.24 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 91c01361a882520a281db0da8aa029bcb480b4873b684a4a526f2542abda9ea0
4
- data.tar.gz: b9ca235a0c706469b4568e74a6ecbec2b39e0b44f9298d781ea11a2dd9cde749
3
+ metadata.gz: 69c311d2bc8bd7acee827939e1d99c4fdf233ddf5e29386f682ac60f67478a6a
4
+ data.tar.gz: 829d403c8ada7d01f444494d163bee0fe1c34db320a93bb34abfcc98aebeb122
5
5
  SHA512:
6
- metadata.gz: d4a5092b448ce3f1a1ad71ed53c5d81304780732a5766d35a2875d0ed244d07b660d25f156a8facd1c9a794d89e59f17915bf640692ba1266556892245b5cab9
7
- data.tar.gz: 4e3d0e476d2479f075abd8bc0cb45028766a766e60a1fae4441a1ef68c451e58757c5cd33c4b3033ccce5784c23112b1f9cb9782b49fd17c8dd2b80f538a4d24
6
+ metadata.gz: 8dc439bf6244f758ceb5c1afb5c2d825b58705342b3a146dfcae9af7dcde86352705d703a2aa0e003922f99c61ec2757aaf7c0ea26279c12efdfedbb6cec8b1b
7
+ data.tar.gz: fd03efac1ba2cacc9be9d40334c835259461a84bb32715e5cbc9723c202d56a52aecdcc8d67ed29a1f050ee0bac853e973955f2211ee41a9da34984a5d13b342
@@ -9,22 +9,22 @@ module Uc3DmpId
9
9
  # Class that compares incoming data from an external source to the DMP
10
10
  # It determines if they are likely related and applies a confidence rating
11
11
  class Comparator
12
- MSG_MISSING_DMP = 'No DMP or the DMP did not contain enough information to use.'
12
+ MSG_MISSING_DMPS = 'No DMPs were defined. Expected an Array of OpenSearch documents!'
13
13
 
14
14
  STOP_WORDS = %w[a an and if of or the then they].freeze
15
15
 
16
16
  # See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
17
17
  # Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
18
18
 
19
- attr_accessor :dmp, :details_hash, :logger
19
+ attr_accessor :dmps, :logger
20
20
 
21
+ # Expecting an Array of OpenSearch documents as :dmps in the :args
21
22
  def initialize(**args)
22
23
  @logger = args[:logger]
23
24
  @details_hash = {}
24
25
 
25
- @dmp = args.fetch(:dmp, {})['dmp'].nil? ? args[:dmp] : args.fetch(:dmp, {})['dmp']
26
- _extract_dmp_details(dmp:)
27
- raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
26
+ @dmps = args.fetch(:dmps, [])
27
+ raise ComparatorError, MSG_MISSING_DMPS if @dmps.empty?
28
28
  end
29
29
 
30
30
  # Compare the incoming hash with the DMP details that were gathered during initialization.
@@ -50,162 +50,58 @@ module Uc3DmpId
50
50
  # }
51
51
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
52
52
  def compare(hash:)
53
- response = { confidence: 'None', score: 0, notes: [] }
54
- return response unless hash.is_a?(Hash) && !hash['title'].nil?
53
+ return [] unless hash.is_a?(Hash) && !hash['title'].nil?
55
54
 
56
55
  # Compare the grant ids. If we have a match return the response immediately since that is
57
56
  # a very positive match!
58
- response = _grants_match?(array: hash['fundings'], response:)
59
- return response if response[:confidence] != 'None'
60
-
61
- response = _opportunities_match?(array: hash['fundings'], response:)
62
- response = _orcids_match?(array: hash['people'], response:)
63
- response = _last_name_and_affiliation_match?(array: hash['people'], response:)
64
-
65
- # Only process the following if we had some matching contributors, affiliations or opportuniy nbrs
66
- response = _repository_match?(array: hash['repositories'], response:) if response[:score].positive?
67
- response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
68
- response = _text_match?(type: 'title', text: hash['title'], response:) if response[:score].positive?
69
- response = _text_match?(type: 'abstract', text: hash['abstract'], response:) if response[:score].positive?
70
- # If the score is less than 3 then we have no confidence that it is a match
71
- return response if response[:score] <= 2
72
-
73
- # Set the confidence level based on the score
74
- response[:confidence] = if response[:score] > 10
75
- 'High'
76
- else
77
- (response[:score] > 5 ? 'Medium' : 'Low')
78
- end
79
- response
80
- end
81
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
82
-
83
- private
84
-
85
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
86
- # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
87
- def _extract_dmp_details(dmp:)
88
- return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
89
-
90
- projects = dmp.fetch('project', [{}])
91
- fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
92
- hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
93
- people = [dmp['contact']]
94
- people << dmp.fetch('contributor', [])
95
-
96
- # Extract all of the important bits about the DMP
97
- @details_hash = {
98
- created: dmp.fetch('created', Time.now.iso8601),
99
- title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
100
- abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
101
- keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
102
- identifiers: [dmp.fetch('dmp_id', {})['identifier']],
103
- last_names: [],
104
- orcids: [],
105
- affiliation_ids: [],
106
- affiliations: [],
107
- funder_names: [],
108
- funder_ids: [],
109
- opportunity_ids: [],
110
- grant_ids: [],
111
- repositories: []
112
- }
113
- _extract_people(array: people&.flatten&.compact&.uniq)
114
- _extract_funding(array: fundings)
115
- _extract_repositories(repos: hosts.flatten.compact.uniq)
116
-
117
- # Clean up the results by flattening and removing duplicates from the Arrays
118
- @details_hash.each_key do |key|
119
- @details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
57
+ scoring = @dmps.map do |entry|
58
+ dmp = entry.fetch('_source', {})
59
+ response = { dmp_id: dmp['_id'], confidence: 'None', score: 0, notes: [] }
60
+ response = _grants_match?(array: hash['fundings'], dmp:, response:)
61
+ return response if response[:confidence] != 'None'
62
+
63
+ response = _opportunities_match?(array: hash['fundings'], dmp:, response:)
64
+ response = _orcids_match?(array: hash['people'], dmp:, response:)
65
+ response = _last_name_and_affiliation_match?(array: hash['people'], dmp:, response:)
66
+
67
+ # Only process the following if we had some matching contributors, affiliations or opportunity numbers
68
+ response = _repository_match?(array: hash['repositories'], dmp:, response:) if response[:score].positive?
69
+ # response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
70
+ response = _text_match?(type: 'title', text: hash['title'], dmp:, response:) if response[:score].positive?
71
+ response = _text_match?(type: 'abstract', text: hash['abstract'], dmp:, response:) if response[:score].positive?
72
+ # If the score is less than 3 then we have no confidence that it is a match
73
+ return nil if response[:score] <= 2
74
+
75
+ # Set the confidence level based on the score
76
+ response[:confidence] = if response[:score] > 10
77
+ 'High'
78
+ else
79
+ (response[:score] > 5 ? 'Medium' : 'Low')
80
+ end
81
+ response
120
82
  end
121
- @logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
122
- end
123
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
124
- # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
125
83
 
126
- # Extract all of the funding information
127
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
128
- def _extract_funding(array:)
129
- return [] unless array.is_a?(Array)
130
-
131
- array.each do |funding|
132
- next unless funding.is_a?(Hash)
133
-
134
- funder_id = funding.fetch('funder_id', {})
135
- ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
136
- fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
137
- opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
138
- grant = funding.fetch('grant_id', {})['identifier']
139
-
140
- @details_hash[:identifiers] << ror&.downcase&.strip
141
- @details_hash[:identifiers] << fundref&.downcase&.strip
142
- @details_hash[:identifiers] << grant&.downcase&.strip
143
- @details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
144
- @details_hash[:identifiers] << opportunity&.downcase&.strip
145
-
146
- @details_hash[:funder_names] << funding['name']&.downcase&.split(' (').first&.strip
147
- @details_hash[:funder_ids] << fundref
148
- @details_hash[:opportunity_ids] << opportunity&.downcase&.strip
149
- @details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
150
- end
151
- array
84
+ # TODO: introduce a tie-breaker here (maybe the closest to the project_end date)
85
+ scoring.compact.sort { |a, b| b[:score] <=> a[:score] }&.first
152
86
  end
153
87
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
154
88
 
155
- # Extract all of the ORCIDs, last names, and affiliation ids and names
156
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
157
- def _extract_people(array:)
158
- return [] unless array.is_a?(Array)
159
-
160
- array.each do |entry|
161
- next unless entry.is_a?(Hash)
162
-
163
- id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
164
- affil = entry.fetch('dmproadmap_affiliation', {})
165
- ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
166
- name = entry.fetch('name', '')&.downcase&.strip
167
- last_name = name.include?(', ') ? name.split(', ').first : name.split.last
168
-
169
- @details_hash[:orcids] << id unless id.nil?
170
- @details_hash[:identifiers] << [id, ror&.downcase&.strip]
171
- @details_hash[:last_names] << last_name
172
- @details_hash[:affiliation_ids] << ror
173
- @details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
174
- end
175
- array
176
- end
177
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
178
-
179
- # Extract all of the re3data ids, URLs and names
180
- # rubocop:disable Metrics/AbcSize
181
- def _extract_repositories(repos:)
182
- return [] unless repos.is_a?(Array)
183
-
184
- repos.each do |repo|
185
- next unless repo.is_a?(Hash)
186
-
187
- @details_hash[:identifiers] << [
188
- repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
189
- ]
190
- @details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
191
- end
192
- repos
193
- end
194
- # rubocop:enable Metrics/AbcSize
89
+ private
195
90
 
196
91
  # Returns whether or not the incoming grant id(s) match the DMPs grant id. Expecting:
197
92
  # [
198
93
  # { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
199
94
  # ]
200
95
  # rubocop:disable Metrics/AbcSize
201
- def _grants_match?(array:, response:)
202
- return response unless array.is_a?(Array) && response.is_a?(Hash)
96
+ def _grants_match?(array:, dmp:, response:)
97
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
98
+ return response unless dmp['grant_ids'].is_a?(Array) && !dmp['grant_ids'].empty?
203
99
 
204
100
  ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
205
101
  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
206
102
  .flatten.compact.uniq
207
103
 
208
- matched = _compare_arrays(array_a: @details_hash.fetch(:grant_ids, []), array_b: ids)
104
+ matched = _compare_arrays(array_a: dmp['grant_ids'], array_b: ids)
209
105
  return response if matched <= 0
210
106
 
211
107
  response[:confidence] = 'Absolute'
@@ -220,14 +116,15 @@ module Uc3DmpId
220
116
  # { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
221
117
  # ]
222
118
  # rubocop:disable Metrics/AbcSize
223
- def _opportunities_match?(array:, response:)
224
- return response unless array.is_a?(Array) && response.is_a?(Hash)
119
+ def _opportunities_match?(array:, dmp:, response:)
120
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
121
+ return response unless dmp['funder_opportunity_ids'].is_a?(Array) && !dmp['funder_opportunity_ids'].empty?
225
122
 
226
123
  ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
227
124
  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
228
125
  .flatten.compact.uniq
229
126
 
230
- matched = _compare_arrays(array_a: @details_hash.fetch(:opportunity_ids, []), array_b: ids)
127
+ matched = _compare_arrays(array_a: dmp['funder_opportunity_ids'], array_b: ids)
231
128
  return response if matched <= 0
232
129
 
233
130
  response[:score] += 5
@@ -245,14 +142,15 @@ module Uc3DmpId
245
142
  # }
246
143
  # ]
247
144
  # rubocop:disable Metrics/AbcSize
248
- def _orcids_match?(array:, response:)
249
- return response unless array.is_a?(Array) && response.is_a?(Hash)
145
+ def _orcids_match?(array:, dmp:, response:)
146
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
147
+ return response unless dmp['people_ids'].is_a?(Array) && !dmp['people_ids'].empty?
250
148
 
251
149
  ids = array.select { |repo| repo.is_a?(Hash) }
252
150
  .map { |person| person['id']&.downcase&.strip }
253
151
  .flatten.compact.uniq
254
152
 
255
- matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
153
+ matched = _compare_arrays(array_a: dmp['people_ids'], array_b: ids)
256
154
  return response if matched <= 0
257
155
 
258
156
  response[:score] += (matched * 2)
@@ -270,8 +168,9 @@ module Uc3DmpId
270
168
  # }
271
169
  # ]
272
170
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
273
- def _last_name_and_affiliation_match?(array:, response:)
274
- return response unless array.is_a?(Array) && response.is_a?(Hash)
171
+ def _last_name_and_affiliation_match?(array:, dmp:, response:)
172
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
173
+ return response unless dmp['people'].is_a?(Array) && !dmp['people'].empty?
275
174
 
276
175
  array = array.select { |repo| repo.is_a?(Hash) }
277
176
  affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
@@ -280,10 +179,10 @@ module Uc3DmpId
280
179
  affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
281
180
 
282
181
  # Check the person last names and affiliation name and RORs
283
- last_names_matched = _compare_arrays(array_a: @details_hash.fetch(:last_names, []), array_b: last_names)
284
- rors_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliation_ids, []), array_b: rors)
285
- affil_names_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliations, []), array_b: affil_names)
286
- return response if last_names_matched <= 0
182
+ last_names_matched = _compare_arrays(array_a: dmp['people'], array_b: last_names)
183
+ rors_matched = _compare_arrays(array_a: dmp.fetch('affiliation_ids', []), array_b: rors)
184
+ affil_names_matched = _compare_arrays(array_a: dmp.fetch('affiliations', []), array_b: affil_names)
185
+ return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
287
186
 
288
187
  response[:score] += last_names_matched + rors_matched + affil_names_matched
289
188
  response[:notes] << 'contributor names and affiliations matched'
@@ -296,15 +195,16 @@ module Uc3DmpId
296
195
  # { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
297
196
  # ]
298
197
  # rubocop:disable Metrics/AbcSize
299
- def _repository_match?(array:, response:)
300
- return response unless array.is_a?(Array) && response.is_a?(Hash)
198
+ def _repository_match?(array:, dmp:, response:)
199
+ return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
200
+ return response unless dmp['repositories'].is_a?(Array) && !dmp['repositories'].empty?
301
201
 
302
202
  # We only care about repositories with ids/urls
303
203
  ids = array.select { |repo| repo.is_a?(Hash) }
304
204
  .map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
305
205
  .flatten.compact.uniq
306
206
 
307
- matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
207
+ matched = _compare_arrays(array_a: dmp['repositories'], array_b: ids)
308
208
  return response if matched <= 0
309
209
 
310
210
  response[:score] += matched
@@ -313,33 +213,19 @@ module Uc3DmpId
313
213
  end
314
214
  # rubocop:enable Metrics/AbcSize
315
215
 
316
- # Returns whether or not the list of keywords exist in the DMP. Expecting:
317
- # keywords: ["foo", "bar"]
318
- def _keyword_match?(array:, response:)
319
- return response unless array.is_a?(Array) && response.is_a?(Hash)
320
-
321
- keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
322
- matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
323
- return response if matched <= 0
324
-
325
- response[:score] += 1
326
- response[:notes] << 'keywords matched'
327
- response
328
- end
329
-
330
216
  # Uses an NLP library to determine if the :text matches the DMP/Project :title or :description
331
217
  # rubocop:disable Metrics/AbcSize
332
- def _text_match?(text:, response:, type: 'title')
333
- return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
334
- !@details_hash[type.to_sym].nil?
218
+ def _text_match?(text:, dmp:, response:, type: 'title')
219
+ return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? && dmp.is_a?(Hash)
335
220
 
336
221
  nlp_processor = Text::WhiteSimilarity.new
337
222
  cleansed = _cleanse_text(text:)
338
223
 
224
+ dmp_val = type == 'title' ? dmp['title'] : dmp['description']
339
225
  details = {
340
- "dmp_#{type}": @details_hash[type.to_sym],
226
+ "dmp_#{type}": dmp_val,
341
227
  "incoming_#{type}": cleansed,
342
- nlp_score: nlp_processor.similarity(@details_hash[type.to_sym], cleansed)
228
+ nlp_score: nlp_processor.similarity(dmp_val, cleansed)
343
229
  }
344
230
  @logger&.debug(message: 'Text::WhiteSimilarity score', details:)
345
231
  return response if details[:nlp_score] < 0.5
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Uc3DmpId
4
- VERSION = '0.1.24'
4
+ VERSION = '0.1.25'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: uc3-dmp-id
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.24
4
+ version: 0.1.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Riley
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-03-08 00:00:00.000000000 Z
11
+ date: 2024-03-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: json