uc3-dmp-id 0.1.24 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/uc3-dmp-id/comparator.rb +93 -216
- data/lib/uc3-dmp-id/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f692820577dff088fec1d1df2e4975a00a6b30952fa66979b1fa8449523d092e
|
4
|
+
data.tar.gz: 8bf2326f3a6fddf9454e915c596eb40231c939337e517de73f6ea998f04a9d6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fcc0689438e54715882ed7c30aa7d608e9c3abaa1e61fec7589a213c7665f257be933f93a482dc4d589756517c94a7f9b4803930f0a2a1bd51d877cd890b8a90
|
7
|
+
data.tar.gz: df18ed7a17053c9527e6a111150b81c2a6d285bda1263b854b0cc3fae5b703e3ef9dd3213b62230431c898c948cf80f6e821da3515089e6cfc156622223bdf7e
|
@@ -9,203 +9,102 @@ module Uc3DmpId
|
|
9
9
|
# Class that compares incoming data from an external source to the DMP
|
10
10
|
# It determines if they are likely related and applies a confidence rating
|
11
11
|
class Comparator
|
12
|
-
|
12
|
+
MSG_MISSING_DMPS = 'No DMPs were defined. Expected an Array of OpenSearch documents!'
|
13
13
|
|
14
14
|
STOP_WORDS = %w[a an and if of or the then they].freeze
|
15
15
|
|
16
16
|
# See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
|
17
17
|
# Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
|
18
18
|
|
19
|
-
attr_accessor :
|
19
|
+
attr_accessor :dmps, :logger
|
20
20
|
|
21
|
+
# Expecting an Array of OpenSearch documents as :dmps in the :args
|
21
22
|
def initialize(**args)
|
22
23
|
@logger = args[:logger]
|
23
24
|
@details_hash = {}
|
24
25
|
|
25
|
-
@
|
26
|
-
|
27
|
-
raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
|
26
|
+
@dmps = args.fetch(:dmps, [])
|
27
|
+
raise ComparatorError, MSG_MISSING_DMPS if @dmps.empty?
|
28
28
|
end
|
29
29
|
|
30
30
|
# Compare the incoming hash with the DMP details that were gathered during initialization.
|
31
31
|
#
|
32
|
-
# The Hash should
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
32
|
+
# The incoming Hash should match the documents found in OpenSearch. For example:
|
33
|
+
# {
|
34
|
+
# "people": ["john doe", "jdoe@example.com"],
|
35
|
+
# "people_ids": ["https://orcid.org/0000-0000-0000-ZZZZ"],
|
36
|
+
# "affiliations": ["example college"],
|
37
|
+
# "affiliation_ids": ["https://ror.org/00000zzzz"],
|
38
|
+
# "funder_ids": ["https://doi.org/10.13039/00000000000"],
|
39
|
+
# "funders": ["example funder (example.gov)"],
|
40
|
+
# "funder_opportunity_ids": ["485yt8325ty"],
|
41
|
+
# "grant_ids": [],
|
42
|
+
# "funding_status": "planned",
|
43
|
+
# "dmp_id": "doi.org/11.22222/A1B2c3po",
|
44
|
+
# "title": "example data management plan",
|
45
|
+
# "visibility": "private",
|
46
|
+
# "featured": 0,
|
47
|
+
# "description": "the example project abstract",
|
48
|
+
# "project_start": "2022-01-03",
|
49
|
+
# "project_end": "2024-12-23",
|
50
|
+
# "created": "2023-08-07",
|
51
|
+
# "modified": "2023-08-07",
|
52
|
+
# "registered": "2023-08-07"
|
53
|
+
# }
|
51
54
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
52
55
|
def compare(hash:)
|
53
|
-
|
54
|
-
return response unless hash.is_a?(Hash) && !hash['title'].nil?
|
56
|
+
return [] unless hash.is_a?(Hash) && !hash['title'].nil?
|
55
57
|
|
56
58
|
# Compare the grant ids. If we have a match return the response immediately since that is
|
57
59
|
# a very positive match!
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
private
|
84
|
-
|
85
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
86
|
-
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
87
|
-
def _extract_dmp_details(dmp:)
|
88
|
-
return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
|
89
|
-
|
90
|
-
projects = dmp.fetch('project', [{}])
|
91
|
-
fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
|
92
|
-
hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
|
93
|
-
people = [dmp['contact']]
|
94
|
-
people << dmp.fetch('contributor', [])
|
95
|
-
|
96
|
-
# Extract all of the important bits about the DMP
|
97
|
-
@details_hash = {
|
98
|
-
created: dmp.fetch('created', Time.now.iso8601),
|
99
|
-
title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
|
100
|
-
abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
|
101
|
-
keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
|
102
|
-
identifiers: [dmp.fetch('dmp_id', {})['identifier']],
|
103
|
-
last_names: [],
|
104
|
-
orcids: [],
|
105
|
-
affiliation_ids: [],
|
106
|
-
affiliations: [],
|
107
|
-
funder_names: [],
|
108
|
-
funder_ids: [],
|
109
|
-
opportunity_ids: [],
|
110
|
-
grant_ids: [],
|
111
|
-
repositories: []
|
112
|
-
}
|
113
|
-
_extract_people(array: people&.flatten&.compact&.uniq)
|
114
|
-
_extract_funding(array: fundings)
|
115
|
-
_extract_repositories(repos: hosts.flatten.compact.uniq)
|
116
|
-
|
117
|
-
# Clean up the results by flattening and removing duplicates from the Arrays
|
118
|
-
@details_hash.each_key do |key|
|
119
|
-
@details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
|
120
|
-
end
|
121
|
-
@logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
|
122
|
-
end
|
123
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
124
|
-
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
125
|
-
|
126
|
-
# Extract all of the funding information
|
127
|
-
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
128
|
-
def _extract_funding(array:)
|
129
|
-
return [] unless array.is_a?(Array)
|
130
|
-
|
131
|
-
array.each do |funding|
|
132
|
-
next unless funding.is_a?(Hash)
|
133
|
-
|
134
|
-
funder_id = funding.fetch('funder_id', {})
|
135
|
-
ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
|
136
|
-
fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
|
137
|
-
opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
|
138
|
-
grant = funding.fetch('grant_id', {})['identifier']
|
139
|
-
|
140
|
-
@details_hash[:identifiers] << ror&.downcase&.strip
|
141
|
-
@details_hash[:identifiers] << fundref&.downcase&.strip
|
142
|
-
@details_hash[:identifiers] << grant&.downcase&.strip
|
143
|
-
@details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
|
144
|
-
@details_hash[:identifiers] << opportunity&.downcase&.strip
|
145
|
-
|
146
|
-
@details_hash[:funder_names] << funding['name']&.downcase&.split(' (').first&.strip
|
147
|
-
@details_hash[:funder_ids] << fundref
|
148
|
-
@details_hash[:opportunity_ids] << opportunity&.downcase&.strip
|
149
|
-
@details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
|
60
|
+
scoring = @dmps.map do |entry|
|
61
|
+
dmp = entry.fetch('_source', {})
|
62
|
+
response = { dmp_id: dmp['_id'], confidence: 'None', score: 0, notes: [] }
|
63
|
+
response = _grants_match?(array: hash.fetch('grant_ids', []), dmp:, response:)
|
64
|
+
return response if response[:confidence] != 'None'
|
65
|
+
|
66
|
+
response = _opportunities_match?(array: hash.fetch('funder_opportunity_ids', []), dmp:, response:)
|
67
|
+
response = _orcids_match?(array: hash.fetch('people_ids', []), dmp:, response:)
|
68
|
+
response = _last_name_and_affiliation_match?(hash:, dmp:, response:)
|
69
|
+
|
70
|
+
# Only process the following if we had some matching contributors, affiliations or opportunity numbers
|
71
|
+
response = _repository_match?(hash:, dmp:, response:) if response[:score].positive?
|
72
|
+
# response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
|
73
|
+
response = _text_match?(type: 'title', text: hash['title'], dmp:, response:) if response[:score].positive?
|
74
|
+
response = _text_match?(type: 'abstract', text: hash['description'], dmp:, response:) if response[:score].positive?
|
75
|
+
# If the score is less than 3 then we have no confidence that it is a match
|
76
|
+
return nil if response[:score] <= 2
|
77
|
+
|
78
|
+
# Set the confidence level based on the score
|
79
|
+
response[:confidence] = if response[:score] > 10
|
80
|
+
'High'
|
81
|
+
else
|
82
|
+
(response[:score] > 5 ? 'Medium' : 'Low')
|
83
|
+
end
|
84
|
+
response
|
150
85
|
end
|
151
|
-
array
|
152
|
-
end
|
153
|
-
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
154
86
|
|
155
|
-
|
156
|
-
|
157
|
-
def _extract_people(array:)
|
158
|
-
return [] unless array.is_a?(Array)
|
159
|
-
|
160
|
-
array.each do |entry|
|
161
|
-
next unless entry.is_a?(Hash)
|
162
|
-
|
163
|
-
id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
|
164
|
-
affil = entry.fetch('dmproadmap_affiliation', {})
|
165
|
-
ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
|
166
|
-
name = entry.fetch('name', '')&.downcase&.strip
|
167
|
-
last_name = name.include?(', ') ? name.split(', ').first : name.split.last
|
168
|
-
|
169
|
-
@details_hash[:orcids] << id unless id.nil?
|
170
|
-
@details_hash[:identifiers] << [id, ror&.downcase&.strip]
|
171
|
-
@details_hash[:last_names] << last_name
|
172
|
-
@details_hash[:affiliation_ids] << ror
|
173
|
-
@details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
|
174
|
-
end
|
175
|
-
array
|
87
|
+
# TODO: introduce a tie-breaker here (maybe the closest to the project_end date)
|
88
|
+
scoring.compact.sort { |a, b| b[:score] <=> a[:score] }&.first
|
176
89
|
end
|
177
90
|
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
178
91
|
|
179
|
-
|
180
|
-
# rubocop:disable Metrics/AbcSize
|
181
|
-
def _extract_repositories(repos:)
|
182
|
-
return [] unless repos.is_a?(Array)
|
183
|
-
|
184
|
-
repos.each do |repo|
|
185
|
-
next unless repo.is_a?(Hash)
|
186
|
-
|
187
|
-
@details_hash[:identifiers] << [
|
188
|
-
repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
|
189
|
-
]
|
190
|
-
@details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
|
191
|
-
end
|
192
|
-
repos
|
193
|
-
end
|
194
|
-
# rubocop:enable Metrics/AbcSize
|
92
|
+
private
|
195
93
|
|
196
94
|
# Returns whether or not the incoming grant id(s) match the DMPs grant id. Expecting:
|
197
95
|
# [
|
198
96
|
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
199
97
|
# ]
|
200
98
|
# rubocop:disable Metrics/AbcSize
|
201
|
-
def _grants_match?(array:, response:)
|
202
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
99
|
+
def _grants_match?(array:, dmp:, response:)
|
100
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
101
|
+
return response unless dmp['grant_ids'].is_a?(Array) && !dmp['grant_ids'].empty?
|
203
102
|
|
204
103
|
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
205
104
|
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
206
105
|
.flatten.compact.uniq
|
207
106
|
|
208
|
-
matched = _compare_arrays(array_a:
|
107
|
+
matched = _compare_arrays(array_a: dmp['grant_ids'], array_b: ids)
|
209
108
|
return response if matched <= 0
|
210
109
|
|
211
110
|
response[:confidence] = 'Absolute'
|
@@ -220,14 +119,15 @@ module Uc3DmpId
|
|
220
119
|
# { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
|
221
120
|
# ]
|
222
121
|
# rubocop:disable Metrics/AbcSize
|
223
|
-
def _opportunities_match?(array:, response:)
|
224
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
122
|
+
def _opportunities_match?(array:, dmp:, response:)
|
123
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
124
|
+
return response unless dmp['funder_opportunity_ids'].is_a?(Array) && !dmp['funder_opportunity_ids'].empty?
|
225
125
|
|
226
126
|
ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
|
227
127
|
.map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
|
228
128
|
.flatten.compact.uniq
|
229
129
|
|
230
|
-
matched = _compare_arrays(array_a:
|
130
|
+
matched = _compare_arrays(array_a: dmp['funder_opportunity_ids'], array_b: ids)
|
231
131
|
return response if matched <= 0
|
232
132
|
|
233
133
|
response[:score] += 5
|
@@ -245,14 +145,15 @@ module Uc3DmpId
|
|
245
145
|
# }
|
246
146
|
# ]
|
247
147
|
# rubocop:disable Metrics/AbcSize
|
248
|
-
def _orcids_match?(array:, response:)
|
249
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
148
|
+
def _orcids_match?(array:, dmp:, response:)
|
149
|
+
return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
150
|
+
return response unless dmp['people_ids'].is_a?(Array) && !dmp['people_ids'].empty?
|
250
151
|
|
251
152
|
ids = array.select { |repo| repo.is_a?(Hash) }
|
252
153
|
.map { |person| person['id']&.downcase&.strip }
|
253
154
|
.flatten.compact.uniq
|
254
155
|
|
255
|
-
matched = _compare_arrays(array_a:
|
156
|
+
matched = _compare_arrays(array_a: dmp['people_ids'], array_b: ids)
|
256
157
|
return response if matched <= 0
|
257
158
|
|
258
159
|
response[:score] += (matched * 2)
|
@@ -262,28 +163,21 @@ module Uc3DmpId
|
|
262
163
|
# rubocop:enable Metrics/AbcSize
|
263
164
|
|
264
165
|
# Returns whether or not the incoming list of creators/contributors matches those on the DMP. Expecting:
|
265
|
-
#
|
266
|
-
#
|
267
|
-
#
|
268
|
-
#
|
269
|
-
#
|
270
|
-
# }
|
271
|
-
# ]
|
166
|
+
# {
|
167
|
+
# people: ["john doe", "jdoe@example.com"],
|
168
|
+
# affiliations: ["example college"],
|
169
|
+
# affiliation_ids: ["https://ror.org/blah"]
|
170
|
+
# }
|
272
171
|
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
273
|
-
def _last_name_and_affiliation_match?(
|
274
|
-
return response unless
|
275
|
-
|
276
|
-
array = array.select { |repo| repo.is_a?(Hash) }
|
277
|
-
affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
|
278
|
-
last_names = array.map { |person| person['last_name']&.downcase&.strip }&.flatten&.compact&.uniq
|
279
|
-
rors = affiliations.map { |affil| affil['id']&.downcase&.strip }&.flatten&.compact&.uniq
|
280
|
-
affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
|
172
|
+
def _last_name_and_affiliation_match?(hash:, dmp:, response:)
|
173
|
+
return response unless hash.is_a?(Hash) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
174
|
+
return response unless hash['people'].is_a?(Array) && !dmp['people'].empty?
|
281
175
|
|
282
176
|
# Check the person last names and affiliation name and RORs
|
283
|
-
last_names_matched = _compare_arrays(array_a:
|
284
|
-
rors_matched = _compare_arrays(array_a:
|
285
|
-
affil_names_matched = _compare_arrays(array_a:
|
286
|
-
return response if last_names_matched <= 0
|
177
|
+
last_names_matched = _compare_arrays(array_a: dmp['people'], array_b: hash['people'])
|
178
|
+
rors_matched = _compare_arrays(array_a: dmp.fetch('affiliation_ids', []), array_b: hash['affiliation_ids'])
|
179
|
+
affil_names_matched = _compare_arrays(array_a: dmp.fetch('affiliations', []), array_b: hash['affiliations'])
|
180
|
+
return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
|
287
181
|
|
288
182
|
response[:score] += last_names_matched + rors_matched + affil_names_matched
|
289
183
|
response[:notes] << 'contributor names and affiliations matched'
|
@@ -292,19 +186,16 @@ module Uc3DmpId
|
|
292
186
|
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
293
187
|
|
294
188
|
# Returns whether or not the incoming list of repositories match those defined in the DMP. Expecting:
|
295
|
-
#
|
296
|
-
#
|
297
|
-
#
|
189
|
+
# {
|
190
|
+
# repo_ids: ["http://some.repo.org", "https://doi.org/re3data123"],
|
191
|
+
# repos: ["repo"]
|
192
|
+
# }
|
298
193
|
# rubocop:disable Metrics/AbcSize
|
299
|
-
def _repository_match?(
|
300
|
-
return response unless
|
301
|
-
|
302
|
-
# We only care about repositories with ids/urls
|
303
|
-
ids = array.select { |repo| repo.is_a?(Hash) }
|
304
|
-
.map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
|
305
|
-
.flatten.compact.uniq
|
194
|
+
def _repository_match?(hash:, dmp:, response:)
|
195
|
+
return response unless hash.is_a?(Hash) && dmp.is_a?(Hash) && response.is_a?(Hash)
|
196
|
+
return response unless hash['repo_ids'].is_a?(Array) && !dmp['repo_ids'].empty?
|
306
197
|
|
307
|
-
matched = _compare_arrays(array_a:
|
198
|
+
matched = _compare_arrays(array_a: dmp['repo_ids'], array_b: hash['repo_ids'])
|
308
199
|
return response if matched <= 0
|
309
200
|
|
310
201
|
response[:score] += matched
|
@@ -313,33 +204,19 @@ module Uc3DmpId
|
|
313
204
|
end
|
314
205
|
# rubocop:enable Metrics/AbcSize
|
315
206
|
|
316
|
-
# Returns whether or not the list of keywords exist in the DMP. Expecting:
|
317
|
-
# keywords: ["foo", "bar"]
|
318
|
-
def _keyword_match?(array:, response:)
|
319
|
-
return response unless array.is_a?(Array) && response.is_a?(Hash)
|
320
|
-
|
321
|
-
keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
|
322
|
-
matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
|
323
|
-
return response if matched <= 0
|
324
|
-
|
325
|
-
response[:score] += 1
|
326
|
-
response[:notes] << 'keywords matched'
|
327
|
-
response
|
328
|
-
end
|
329
|
-
|
330
207
|
# Uses an NLP library to determine if the :text matches the DMP/Project :title or :description
|
331
208
|
# rubocop:disable Metrics/AbcSize
|
332
|
-
def _text_match?(text:, response:, type: 'title')
|
333
|
-
return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
|
334
|
-
!@details_hash[type.to_sym].nil?
|
209
|
+
def _text_match?(text:, dmp:, response:, type: 'title')
|
210
|
+
return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? && dmp.is_a?(Hash)
|
335
211
|
|
336
212
|
nlp_processor = Text::WhiteSimilarity.new
|
337
213
|
cleansed = _cleanse_text(text:)
|
338
214
|
|
215
|
+
dmp_val = type == 'title' ? dmp['title'] : dmp['description']
|
339
216
|
details = {
|
340
|
-
"dmp_#{type}":
|
217
|
+
"dmp_#{type}": dmp_val,
|
341
218
|
"incoming_#{type}": cleansed,
|
342
|
-
nlp_score: nlp_processor.similarity(
|
219
|
+
nlp_score: nlp_processor.similarity(dmp_val, cleansed)
|
343
220
|
}
|
344
221
|
@logger&.debug(message: 'Text::WhiteSimilarity score', details:)
|
345
222
|
return response if details[:nlp_score] < 0.5
|
data/lib/uc3-dmp-id/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: uc3-dmp-id
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Riley
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-03-
|
11
|
+
date: 2024-03-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|