RubyGems - uc3-dmp-id - Versions diffs - 0.1.24 → 0.1.25 - Mend

uc3-dmp-id 0.1.24 → 0.1.25

Files changed (4) hide show

checksums.yaml +4 -4
data/lib/uc3-dmp-id/comparator.rb +62 -176
data/lib/uc3-dmp-id/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 91c01361a882520a281db0da8aa029bcb480b4873b684a4a526f2542abda9ea0
-  data.tar.gz: b9ca235a0c706469b4568e74a6ecbec2b39e0b44f9298d781ea11a2dd9cde749
+  metadata.gz: 69c311d2bc8bd7acee827939e1d99c4fdf233ddf5e29386f682ac60f67478a6a
+  data.tar.gz: 829d403c8ada7d01f444494d163bee0fe1c34db320a93bb34abfcc98aebeb122
 SHA512:
-  metadata.gz: d4a5092b448ce3f1a1ad71ed53c5d81304780732a5766d35a2875d0ed244d07b660d25f156a8facd1c9a794d89e59f17915bf640692ba1266556892245b5cab9
-  data.tar.gz: 4e3d0e476d2479f075abd8bc0cb45028766a766e60a1fae4441a1ef68c451e58757c5cd33c4b3033ccce5784c23112b1f9cb9782b49fd17c8dd2b80f538a4d24
+  metadata.gz: 8dc439bf6244f758ceb5c1afb5c2d825b58705342b3a146dfcae9af7dcde86352705d703a2aa0e003922f99c61ec2757aaf7c0ea26279c12efdfedbb6cec8b1b
+  data.tar.gz: fd03efac1ba2cacc9be9d40334c835259461a84bb32715e5cbc9723c202d56a52aecdcc8d67ed29a1f050ee0bac853e973955f2211ee41a9da34984a5d13b342

data/lib/uc3-dmp-id/comparator.rb CHANGED Viewed

@@ -9,22 +9,22 @@ module Uc3DmpId
   # Class that compares incoming data from an external source to the DMP
   # It determines if they are likely related and applies a confidence rating
   class Comparator
-    MSG_MISSING_DMP = 'No DMP or the DMP did not contain enough information to use.'
+    MSG_MISSING_DMPS = 'No DMPs were defined. Expected an Array of OpenSearch documents!'
     STOP_WORDS = %w[a an and if of or the then they].freeze
     # See the bottom of this file for a hard-coded crosswalk between Crossref funder ids and ROR ids
     # Some APIs do not support ROR fully for funder ids, so we need to be able to reference both
-    attr_accessor :dmp, :details_hash, :logger
+    attr_accessor :dmps, :logger
+    # Expecting an Array of OpenSearch documents as :dmps in the :args
     def initialize(**args)
       @logger = args[:logger]
       @details_hash = {}
-      @dmp = args.fetch(:dmp, {})['dmp'].nil? ? args[:dmp] : args.fetch(:dmp, {})['dmp']
-      _extract_dmp_details(dmp:)
-      raise ComparatorError, MSG_MISSING_DMP if @details_hash.empty?
+      @dmps = args.fetch(:dmps, [])
+      raise ComparatorError, MSG_MISSING_DMPS if @dmps.empty?
     end
     # Compare the incoming hash with the DMP details that were gathered during initialization.
@@ -50,162 +50,58 @@ module Uc3DmpId
     #  }
     # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
     def compare(hash:)
-      response = { confidence: 'None', score: 0, notes: [] }
-      return response unless hash.is_a?(Hash) && !hash['title'].nil?
+      return [] unless hash.is_a?(Hash) && !hash['title'].nil?
       # Compare the grant ids. If we have a match return the response immediately since that is
       # a very positive match!
-      response = _grants_match?(array: hash['fundings'], response:)
-      return response if response[:confidence] != 'None'
-      response = _opportunities_match?(array: hash['fundings'], response:)
-      response = _orcids_match?(array: hash['people'], response:)
-      response = _last_name_and_affiliation_match?(array: hash['people'], response:)
-      # Only process the following if we had some matching contributors, affiliations or opportuniy nbrs
-      response = _repository_match?(array: hash['repositories'], response:) if response[:score].positive?
-      response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
-      response = _text_match?(type: 'title', text: hash['title'], response:) if response[:score].positive?
-      response = _text_match?(type: 'abstract', text: hash['abstract'], response:) if response[:score].positive?
-      # If the score is less than 3 then we have no confidence that it is a match
-      return response if response[:score] <= 2
-      # Set the confidence level based on the score
-      response[:confidence] = if response[:score] > 10
-                                'High'
-                              else
-                                (response[:score] > 5 ? 'Medium' : 'Low')
-                              end
-      response
-    end
-    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    private
-    # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
-    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    def _extract_dmp_details(dmp:)
-      return nil unless dmp.is_a?(Hash) && !dmp['title'].nil? && !dmp['contact'].nil?
-      projects = dmp.fetch('project', [{}])
-      fundings = projects.map { |proj| proj.fetch('funding', []) }.flatten.compact.uniq
-      hosts = dmp.fetch('dataset', []).map { |dset| dset.fetch('distribution', []).map { |d| d['host'] } }
-      people = [dmp['contact']]
-      people << dmp.fetch('contributor', [])
-      # Extract all of the important bits about the DMP
-      @details_hash = {
-        created: dmp.fetch('created', Time.now.iso8601),
-        title: _cleanse_text(text: projects&.first&.fetch('title', dmp['title'])),
-        abstract: _cleanse_text(text: projects&.first&.fetch('description', dmp['description'])),
-        keywords: dmp.fetch('dataset', []).map { |ds| ds.fetch('keyword', []) }.flatten.compact.uniq,
-        identifiers: [dmp.fetch('dmp_id', {})['identifier']],
-        last_names: [],
-        orcids: [],
-        affiliation_ids: [],
-        affiliations: [],
-        funder_names: [],
-        funder_ids: [],
-        opportunity_ids: [],
-        grant_ids: [],
-        repositories: []
-      }
-      _extract_people(array: people&.flatten&.compact&.uniq)
-      _extract_funding(array: fundings)
-      _extract_repositories(repos: hosts.flatten.compact.uniq)
-      # Clean up the results by flattening and removing duplicates from the Arrays
-      @details_hash.each_key do |key|
-        @details_hash[key] = @details_hash[key].flatten.compact.uniq if @details_hash[key].is_a?(Array)
+      scoring = @dmps.map do |entry|
+        dmp = entry.fetch('_source', {})
+        response = { dmp_id: dmp['_id'], confidence: 'None', score: 0, notes: [] }
+        response = _grants_match?(array: hash['fundings'], dmp:, response:)
+        return response if response[:confidence] != 'None'
+        response = _opportunities_match?(array: hash['fundings'], dmp:, response:)
+        response = _orcids_match?(array: hash['people'], dmp:, response:)
+        response = _last_name_and_affiliation_match?(array: hash['people'], dmp:, response:)
+        # Only process the following if we had some matching contributors, affiliations or opportunity numbers
+        response = _repository_match?(array: hash['repositories'], dmp:, response:) if response[:score].positive?
+        # response = _keyword_match?(array: hash['keywords'], response:) if response[:score].positive?
+        response = _text_match?(type: 'title', text: hash['title'], dmp:, response:) if response[:score].positive?
+        response = _text_match?(type: 'abstract', text: hash['abstract'], dmp:, response:) if response[:score].positive?
+        # If the score is less than 3 then we have no confidence that it is a match
+        return nil if response[:score] <= 2
+        # Set the confidence level based on the score
+        response[:confidence] = if response[:score] > 10
+                                  'High'
+                                else
+                                  (response[:score] > 5 ? 'Medium' : 'Low')
+                                end
+        response
       end
-      @logger&.debug(message: 'Extracted the following from the DMP', details: @details_hash)
-    end
-    # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
-    # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    # Extract all of the funding information
-    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    def _extract_funding(array:)
-      return [] unless array.is_a?(Array)
-      array.each do |funding|
-        next unless funding.is_a?(Hash)
-        funder_id = funding.fetch('funder_id', {})
-        ror = funder_id['identifier'] if funder_id['type']&.downcase&.strip == 'ror'
-        fundref = ror.nil? ? funder_id['identifier']&.downcase&.strip : ROR_FUNDREF_ID_CROSSWALK[:"#{ror}"]
-        opportunity = funding.fetch('dmproadmap_funding_opportunity_id', {})['identifier']
-        grant = funding.fetch('grant_id', {})['identifier']
-        @details_hash[:identifiers] << ror&.downcase&.strip
-        @details_hash[:identifiers] << fundref&.downcase&.strip
-        @details_hash[:identifiers] << grant&.downcase&.strip
-        @details_hash[:identifiers] << grant&.split('/')&.last&.downcase&.strip
-        @details_hash[:identifiers] << opportunity&.downcase&.strip
-        @details_hash[:funder_names] << funding['name']&.downcase&.split(' (').first&.strip
-        @details_hash[:funder_ids] << fundref
-        @details_hash[:opportunity_ids] << opportunity&.downcase&.strip
-        @details_hash[:grant_ids] << [grant&.downcase&.strip, grant&.split('/')&.last&.downcase&.strip]
-      end
-      array
+      # TODO: introduce a tie-breaker here (maybe the one closest to the project_end date)
+      scoring.compact.sort { |a, b| b[:score] <=> a[:score] }&.first
     end
     # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    # Extract all of the ORCIDs, last names, and affiliation ids and names
-    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    def _extract_people(array:)
-      return [] unless array.is_a?(Array)
-      array.each do |entry|
-        next unless entry.is_a?(Hash)
-        id = entry.fetch('contributor_id', entry.fetch('contact_id', {}))['identifier']&.downcase&.strip
-        affil = entry.fetch('dmproadmap_affiliation', {})
-        ror = affil.fetch('affiliation_id', {})['identifier']&.downcase&.strip
-        name = entry.fetch('name', '')&.downcase&.strip
-        last_name = name.include?(', ') ? name.split(', ').first : name.split.last
-        @details_hash[:orcids] << id unless id.nil?
-        @details_hash[:identifiers] << [id, ror&.downcase&.strip]
-        @details_hash[:last_names] << last_name
-        @details_hash[:affiliation_ids] << ror
-        @details_hash[:affiliations] << affil.fetch('name', '')&.split(' (')&.first&.downcase&.strip
-      end
-      array
-    end
-    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    # Extract all of the re3data ids, URLs and names
-    # rubocop:disable Metrics/AbcSize
-    def _extract_repositories(repos:)
-      return [] unless repos.is_a?(Array)
-      repos.each do |repo|
-        next unless repo.is_a?(Hash)
-        @details_hash[:identifiers] << [
-          repo['url']&.downcase&.strip, repo.fetch('dmproadmap_host_id', {})['identifier']&.downcase&.strip
-        ]
-        @details_hash[:repositories] << repo.fetch('name', '')&.downcase&.strip
-      end
-      repos
-    end
-    # rubocop:enable Metrics/AbcSize
+    private
     # Returns whether or not the incoming grant id(s) match the DMPs grant id. Expecting:
     #    [
     #      { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
     #    ]
     # rubocop:disable Metrics/AbcSize
-    def _grants_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
+    def _grants_match?(array:, dmp:, response:)
+      return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
+      return response unless dmp['grant_ids'].is_a?(Array) && !dmp['grant_ids'].empty?
       ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
                  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
                  .flatten.compact.uniq
-      matched = _compare_arrays(array_a: @details_hash.fetch(:grant_ids, []), array_b: ids)
+      matched = _compare_arrays(array_a: dmp['grant_ids'], array_b: ids)
       return response if matched <= 0
       response[:confidence] = 'Absolute'
@@ -220,14 +116,15 @@ module Uc3DmpId
     #      { id: "https://doi.org/crossref123", name: "Bar", grant: ["1234", "http://foo.bar/543"] }
     #    ]
     # rubocop:disable Metrics/AbcSize
-    def _opportunities_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
+    def _opportunities_match?(array:, dmp:, response:)
+      return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
+      return response unless dmp['funder_opportunity_ids'].is_a?(Array) && !dmp['funder_opportunity_ids'].empty?
       ids = array.select { |funding| funding.is_a?(Hash) && funding['grant'].is_a?(Array) }
                  .map { |funding| funding['grant'].map { |id| id&.downcase&.strip } }
                  .flatten.compact.uniq
-      matched = _compare_arrays(array_a: @details_hash.fetch(:opportunity_ids, []), array_b: ids)
+      matched = _compare_arrays(array_a: dmp['funder_opportunity_ids'], array_b: ids)
       return response if matched <= 0
       response[:score] += 5
@@ -245,14 +142,15 @@ module Uc3DmpId
     #      }
     #    ]
     # rubocop:disable Metrics/AbcSize
-    def _orcids_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
+    def _orcids_match?(array:, dmp:, response:)
+      return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
+      return response unless dmp['people_ids'].is_a?(Array) && !dmp['people_ids'].empty?
       ids = array.select { |repo| repo.is_a?(Hash) }
                  .map { |person| person['id']&.downcase&.strip }
                  .flatten.compact.uniq
-      matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
+      matched = _compare_arrays(array_a: dmp['people_ids'], array_b: ids)
       return response if matched <= 0
       response[:score] += (matched * 2)
@@ -270,8 +168,9 @@ module Uc3DmpId
     #      }
     #    ]
     # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-    def _last_name_and_affiliation_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
+    def _last_name_and_affiliation_match?(array:, dmp:, response:)
+      return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
+      return response unless dmp['people'].is_a?(Array) && !dmp['people'].empty?
       array = array.select { |repo| repo.is_a?(Hash) }
       affiliations = array.map { |person| person['affiliation'] }&.flatten&.compact&.uniq
@@ -280,10 +179,10 @@ module Uc3DmpId
       affil_names = affiliations.map { |affil| affil['name']&.downcase&.strip }&.flatten&.compact&.uniq
       # Check the person last names and affiliation name and RORs
-      last_names_matched = _compare_arrays(array_a: @details_hash.fetch(:last_names, []), array_b: last_names)
-      rors_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliation_ids, []), array_b: rors)
-      affil_names_matched = _compare_arrays(array_a: @details_hash.fetch(:affiliations, []), array_b: affil_names)
-      return response if last_names_matched <= 0
+      last_names_matched = _compare_arrays(array_a: dmp['people'], array_b: last_names)
+      rors_matched = _compare_arrays(array_a: dmp.fetch('affiliation_ids', []), array_b: rors)
+      affil_names_matched = _compare_arrays(array_a: dmp.fetch('affiliations', []), array_b: affil_names)
+      return response if last_names_matched <= 0 && rors_matched <= 0 && affil_names_matched <= 0
       response[:score] += last_names_matched + rors_matched + affil_names_matched
       response[:notes] << 'contributor names and affiliations matched'
@@ -296,15 +195,16 @@ module Uc3DmpId
     #      { id: ["http://some.repo.org", "https://doi.org/re3data123"], name: "Repo" }
     #    ]
     # rubocop:disable Metrics/AbcSize
-    def _repository_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
+    def _repository_match?(array:, dmp:, response:)
+      return response unless array.is_a?(Array) && dmp.is_a?(Hash) && response.is_a?(Hash)
+      return response unless dmp['repositories'].is_a?(Array) && !dmp['repositories'].empty?
       # We only care about repositories with ids/urls
       ids = array.select { |repo| repo.is_a?(Hash) }
                  .map { |repo| repo['id'].map { |id| id&.downcase&.strip } }
                  .flatten.compact.uniq
-      matched = _compare_arrays(array_a: @details_hash.fetch(:identifiers, []), array_b: ids)
+      matched = _compare_arrays(array_a: dmp['repositories'], array_b: ids)
       return response if matched <= 0
       response[:score] += matched
@@ -313,33 +213,19 @@ module Uc3DmpId
     end
     # rubocop:enable Metrics/AbcSize
-    # Returns whether or not the list of keywords exist in the DMP. Expecting:
-    #     keywords: ["foo", "bar"]
-    def _keyword_match?(array:, response:)
-      return response unless array.is_a?(Array) && response.is_a?(Hash)
-      keywords = array.map { |word| word&.downcase&.strip }&.flatten&.compact&.uniq
-      matched = _compare_arrays(array_a: @details_hash.fetch(:keywords, []), array_b: keywords)
-      return response if matched <= 0
-      response[:score] += 1
-      response[:notes] << 'keywords matched'
-      response
-    end
     # Uses an NLP library to determine if the :text matches the DMP/Project :title or :description
     # rubocop:disable Metrics/AbcSize
-    def _text_match?(text:, response:, type: 'title')
-      return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? &&
-                             !@details_hash[type.to_sym].nil?
+    def _text_match?(text:, dmp:, response:, type: 'title')
+      return response unless response.is_a?(Hash) && text.is_a?(String) && !text.strip.empty? && dmp.is_a?(Hash)
       nlp_processor = Text::WhiteSimilarity.new
       cleansed = _cleanse_text(text:)
+      dmp_val = type == 'title' ? dmp['title'] : dmp['description']
       details = {
-        "dmp_#{type}": @details_hash[type.to_sym],
+        "dmp_#{type}": dmp_val,
         "incoming_#{type}": cleansed,
-        nlp_score: nlp_processor.similarity(@details_hash[type.to_sym], cleansed)
+        nlp_score: nlp_processor.similarity(dmp_val, cleansed)
       }
       @logger&.debug(message: 'Text::WhiteSimilarity score', details:)
       return response if details[:nlp_score] < 0.5

data/lib/uc3-dmp-id/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Uc3DmpId
-  VERSION = '0.1.24'
+  VERSION = '0.1.25'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: uc3-dmp-id
 version: !ruby/object:Gem::Version
-  version: 0.1.24
+  version: 0.1.25
 platform: ruby
 authors:
 - Brian Riley
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-03-08 00:00:00.000000000 Z
+date: 2024-03-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json