RubyGems - discovery-indexer - Versions diffs - 0.10.1 → 0.10.2 - Mend

discovery-indexer 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/lib/discovery-indexer/collection.rb +48 -0
data/lib/{errors.rb → discovery-indexer/errors.rb} +0 -0
data/lib/discovery-indexer/general_mapper.rb +44 -0
data/lib/{logging.rb → discovery-indexer/logging.rb} +0 -0
data/lib/{reader → discovery-indexer/reader}/modsxml.rb +0 -9
data/lib/{reader → discovery-indexer/reader}/modsxml_reader.rb +0 -0
data/lib/{reader → discovery-indexer/reader}/purlxml.rb +1 -11
data/lib/{reader → discovery-indexer/reader}/purlxml_model.rb +0 -0
data/lib/{reader → discovery-indexer/reader}/purlxml_parser_strict.rb +8 -6
data/lib/{reader → discovery-indexer/reader}/purlxml_reader.rb +1 -3
data/lib/{version.rb → discovery-indexer/version.rb} +1 -1
data/lib/discovery-indexer.rb +10 -13
metadata +27 -15
data/lib/mapper/general_mapper.rb +0 -27
data/lib/reader/purlxml_parser.rb +0 -13
data/lib/writer/solr_client.rb +0 -113
data/lib/writer/solr_writer.rb +0 -54

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 41c0aca7bebdf8eea7c07d8c2d48944e8bcc88c4
-  data.tar.gz: 5d0b42289e313ea24e32b4f0f66dc762c8475efa
+  metadata.gz: 94eb6c9bdbd29fc02f9aece9351e6c4af77a59b1
+  data.tar.gz: bb54745bb7c03fb7a60559e55cc7804db706cd8b
 SHA512:
-  metadata.gz: 6d030fb91ba3fec33475e4c5fdd59aaed19e94b6178b6de53b4513be85f94655e420bb8c467bfcb8738f1e943e39b66856e10c7069a9513263d69b8eecaba2db
-  data.tar.gz: 7c489649c3b0b34332108f05e80b75646863b8251a5a431ed8557dad955b5efe58ca0e6edab9326800b7c2a7021b04ccbc4e5e6c9a9283ace69f691c44d3f31e
+  metadata.gz: 80a631460ec997ab2c92b90836bca19ff8a4fc12ab70f7bbd70684cda9a152f29aa8a1d6c6431bb914a2ffc8a8ea7af7cad9196c00f6dda902dcdaaace1a8202
+  data.tar.gz: e02780cf225013328439cbe55b4c890ceb6fba82e77b9f3dfe98a617aaa591c310bc7f97365b1f0dfe4a11c16400e7484dd0b03222a7e59edaa56d5442521dfb

data/lib/discovery-indexer/collection.rb ADDED Viewed

@@ -0,0 +1,48 @@
+module DiscoveryIndexer
+  # Looks up a collection's title and catkey from PURL; the PURL model is memoized per instance once loaded
+  class Collection
+    attr_reader :druid
+    delegate :present?, to: :collection_info
+    def initialize(druid)
+      @druid = druid
+    end
+    def searchworks_id
+      collection_info[:ckey] || druid
+    end
+    def title
+      collection_info[:title]
+    end
+    private
+    # Returns the collection title and catkey as read from PURL.
+    #
+    # Takes no arguments; the collection druid comes from the instance (see #druid).
+    # @return [Hash] { title: ..., ckey: ... }, or {} when the PURL model could not
+    #   be loaded
+    def collection_info
+      from_purl || {}
+    end
+    # Builds the collection data hash from the PURL model's label and catkey.
+    # @return [Hash, nil] { title: ..., ckey: ... }, or nil when no PURL model is available
+    def from_purl
+      return unless purl_model
+      { title: purl_model.label, ckey: purl_model.catkey }
+    end
+    def purl_model
+      @purl_model ||= begin
+        DiscoveryIndexer::InputXml::Purlxml.new(druid).load
+      rescue => e
+        DiscoveryIndexer::Logging.logger.error "There is a problem in retrieving collection name and/or catkey for #{druid}. #{e.inspect}\n#{e.message }\n#{e.backtrace}"
+        nil
+      end
+    end
+  end
+end

data/lib/{errors.rb → discovery-indexer/errors.rb} RENAMED Viewed

File without changes

data/lib/discovery-indexer/general_mapper.rb ADDED Viewed

@@ -0,0 +1,44 @@
+module DiscoveryIndexer
+  class GeneralMapper
+    attr_reader :druid
+    # Initializes a mapper for a single druid.
+    #
+    # Only the druid is passed in. The MODS data (#modsxml), the purlxml model
+    # (#purlxml) and the collection objects (#collection_data) are loaded on first
+    # use and memoized.
+    # @param [String] druid e.g. ab123cd4567
+    def initialize(druid)
+      @druid = druid
+    end
+    # Create a Hash representing a Solr doc, with all MODS related fields populated.
+    # @return [Hash] Hash representing the Solr document
+    def convert_to_solr_doc
+      solr_doc = {}
+      solr_doc[:id] = druid
+      solr_doc[:title] = modsxml.sw_full_title
+      solr_doc
+    end
+    # Wraps each of this object's collection druids in a DiscoveryIndexer::Collection.
+    # The list is built once and memoized.
+    # @return [Array<DiscoveryIndexer::Collection>] one Collection per entry of
+    #   #collection_druids
+    def collection_data
+      @collection_data ||= collection_druids.map do |cdruid|
+        DiscoveryIndexer::Collection.new(cdruid)
+      end
+    end
+    def collection_druids
+      purlxml.collection_druids
+    end
+    def modsxml
+      @modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
+    end
+    def purlxml
+      @purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
+    end
+  end
+end

data/lib/{logging.rb → discovery-indexer/logging.rb} RENAMED Viewed

File without changes

data/lib/{reader → discovery-indexer/reader}/modsxml.rb RENAMED Viewed

@@ -25,15 +25,6 @@ module DiscoveryIndexer
         modsxml_model = Stanford::Mods::Record.new
         modsxml_model.from_nk_node(@modsxml_ng_doc)
-        modsxml_model
-      end
-      # loads the mods xml to stanford mods model for the fedora object defind in the druid,
-      # it reads the mods xml from PURL server with every call
-      # @return [Stanford::Mods::Record] represents the mods xml
-      def reload
-        @modsxml_ng_doc = ModsxmlReader.read(@druid)
-        load
       end
     end
   end

data/lib/{reader → discovery-indexer/reader}/modsxml_reader.rb RENAMED Viewed

File without changes

data/lib/{reader → discovery-indexer/reader}/purlxml.rb RENAMED Viewed

@@ -19,17 +19,7 @@ module DiscoveryIndexer
       # @return [PurlxmlModel] represents the purlxml
       def load
         @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
-        purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
-        purlxml_model = purlxml_parser.parse
-        purlxml_model
-      end
-      # loads the purl xml to purlxml model for the fedora object defind in the druid
-      # it reads the purl xml from PURL server with every call
-      # @return [PurlxmlModel] represents the purlxml
-      def reload
-        @purlxml_ng_doc = PurlxmlReader.read(@druid)
-        load
+        purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
       end
     end
   end

data/lib/{reader → discovery-indexer/reader}/purlxml_model.rb RENAMED Viewed

File without changes

data/lib/{reader → discovery-indexer/reader}/purlxml_parser_strict.rb RENAMED Viewed

@@ -1,12 +1,17 @@
 module DiscoveryIndexer
   module InputXml
-    class PurlxmlParserStrict < PurlxmlParser
+    class PurlxmlParserStrict
       include DiscoveryIndexer::Logging
       RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
       OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
       MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
+      def initialize(druid, purlxml_ng_doc)
+        @purlxml_ng_doc = purlxml_ng_doc
+        @druid = druid
+      end
       # it parses the purlxml into a purlxml model
       # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
       def parse
@@ -117,12 +122,9 @@ module DiscoveryIndexer
         ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
         is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
         # from public_xml rels-ext
-        druids = []
-        is_member_of_nodes.each do |n|
-          druids << n.value.split('druid:').last unless n.value.empty?
+        is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
+          n.value.split('druid:').last
         end
-        return nil if druids.empty?
-        druids
       end
       # the value of the type attribute for a DOR object's contentMetadata

data/lib/{reader → discovery-indexer/reader}/purlxml_reader.rb RENAMED Viewed

@@ -9,10 +9,8 @@ module DiscoveryIndexer
       # @raise [MissingPublicXml] if there's no purl xml available for this druid
       def self.read(druid)
         purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
         begin
-          purlxml_object = Nokogiri::XML(open(purlxml_uri))
-          return purlxml_object
+          Nokogiri::XML(open(purlxml_uri))
         rescue
           raise DiscoveryIndexer::Errors::MissingPurlPage.new(purlxml_uri)
         end

data/lib/{version.rb → discovery-indexer/version.rb} RENAMED Viewed

@@ -1,3 +1,3 @@
 module DiscoveryIndexer
-  VERSION = '0.10.1'
+  VERSION = '0.10.2'
 end

data/lib/discovery-indexer.rb CHANGED Viewed

@@ -1,19 +1,16 @@
-require 'errors'
-require 'logging'
+require 'discovery-indexer/errors'
+require 'discovery-indexer/logging'
-require 'reader/purlxml'
-require 'reader/purlxml_reader'
-require 'reader/purlxml_parser'
-require 'reader/purlxml_parser_strict'
-require 'reader/purlxml_model'
+require 'discovery-indexer/reader/purlxml'
+require 'discovery-indexer/reader/purlxml_reader'
+require 'discovery-indexer/reader/purlxml_parser_strict'
+require 'discovery-indexer/reader/purlxml_model'
-require 'reader/modsxml'
-require 'reader/modsxml_reader'
+require 'discovery-indexer/reader/modsxml'
+require 'discovery-indexer/reader/modsxml_reader'
-require 'mapper/general_mapper'
-require 'writer/solr_client'
-require 'writer/solr_writer'
+require 'discovery-indexer/general_mapper'
+require 'discovery-indexer/collection'
 # require 'utilities/extract_sub_targets'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: discovery-indexer
 version: !ruby/object:Gem::Version
-  version: 0.10.1
+  version: 0.10.2
 platform: ruby
 authors:
 - Ahmed AlSum
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-11 00:00:00.000000000 Z
+date: 2015-12-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -81,6 +81,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -145,19 +159,17 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/discovery-indexer.rb
-- lib/errors.rb
-- lib/logging.rb
-- lib/mapper/general_mapper.rb
-- lib/reader/modsxml.rb
-- lib/reader/modsxml_reader.rb
-- lib/reader/purlxml.rb
-- lib/reader/purlxml_model.rb
-- lib/reader/purlxml_parser.rb
-- lib/reader/purlxml_parser_strict.rb
-- lib/reader/purlxml_reader.rb
-- lib/version.rb
-- lib/writer/solr_client.rb
-- lib/writer/solr_writer.rb
+- lib/discovery-indexer/collection.rb
+- lib/discovery-indexer/errors.rb
+- lib/discovery-indexer/general_mapper.rb
+- lib/discovery-indexer/logging.rb
+- lib/discovery-indexer/reader/modsxml.rb
+- lib/discovery-indexer/reader/modsxml_reader.rb
+- lib/discovery-indexer/reader/purlxml.rb
+- lib/discovery-indexer/reader/purlxml_model.rb
+- lib/discovery-indexer/reader/purlxml_parser_strict.rb
+- lib/discovery-indexer/reader/purlxml_reader.rb
+- lib/discovery-indexer/version.rb
 homepage:
 licenses:
 - Stanford University

data/lib/mapper/general_mapper.rb DELETED Viewed

@@ -1,27 +0,0 @@
-module DiscoveryIndexer
-  module Mapper
-    class GeneralMapper
-      # Initializes an instance from IndexMapper
-      # @param [String] druid e.g. ab123cd4567
-      # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
-      # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
-      # @param [Hash] collection_data represents a hash of collection_druid and catkey
-      # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
-      def initialize(druid, modsxml, purlxml, collection_data = {})
-        @druid = druid
-        @modsxml = modsxml
-        @purlxml = purlxml
-        @collection_data = collection_data
-      end
-      # Create a Hash representing a Solr doc, with all MODS related fields populated.
-      # @return [Hash] Hash representing the Solr document
-      def convert_to_solr_doc
-        solr_doc = {}
-        solr_doc[:id] = @druid
-        solr_doc[:title] = @modsxml.sw_full_title
-        solr_doc
-      end
-    end
-  end
-end

data/lib/reader/purlxml_parser.rb DELETED Viewed

@@ -1,13 +0,0 @@
-module DiscoveryIndexer
-  module InputXml
-    class PurlxmlParser
-      def initialize(druid, purlxml_ng_doc)
-        @purlxml_ng_doc = purlxml_ng_doc
-        @druid = druid
-      end
-      def parse
-      end
-    end
-  end
-end

data/lib/writer/solr_client.rb DELETED Viewed

@@ -1,113 +0,0 @@
-require 'retries'
-require 'rsolr'
-require 'rest-client'
-module DiscoveryIndexer
-  module Writer
-    # Processes adds and deletes to the solr core
-    class SolrClient
-      include DiscoveryIndexer::Logging
-      # Add the document to solr, retry if an error occurs.
-      # See https://github.com/ooyala/retries for docs on with_retries.
-      # @param id [String] the document id, usually it will be druid.
-      # @param solr_doc [Hash] a Hash representation of the solr document
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # @param max_retries [Integer] the maximum number of tries before fail
-      def self.add(id, solr_doc, solr_connector, max_retries = 10)
-        process(id, solr_doc, solr_connector, max_retries, false)
-      end
-      # Add the document to solr, retry if an error occurs.
-      # See https://github.com/ooyala/retries for docs on with_retries.
-      # @param id [String] the document id, usually it will be druid.
-      # @param solr_connector[RSolr::Client]  is an open connection with the solr core
-      # @param max_retries [Integer] the maximum number of tries before fail
-      def self.delete(id, solr_connector, max_retries = 10)
-        process(id, {}, solr_connector, max_retries, true)
-      end
-      # It's an internal method that receives all the requests and deal with
-      # SOLR core. This method can call add, delete, or update
-      #
-      # @param id [String] the document id, usually it will be druid.
-      # @param solr_doc [Hash] is the solr doc in hash format
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # @param max_retries [Integer] the maximum number of tries before fail
-      def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
-        handler = proc do |exception, attempt_number, _total_delay|
-          DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
-        end
-        with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
-          DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
-          if is_delete
-            DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
-            solr_connector.delete_by_id(id, :add_attributes => {:commitWithin => 10000})
-          elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
-            DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
-            update_solr_doc(id, solr_doc, solr_connector)
-          else
-            DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
-            solr_connector.add(solr_doc, :add_attributes => {:commitWithin => 10000})
-          end
-          #solr_connector.commit
-          DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
-        end
-      end
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # @return [Boolean] true if the solr core allowing update feature
-      def self.allow_update?(solr_connector)
-        solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
-      end
-      # @param id [String] the document id, usually it will be druid.
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # @return [Boolean] true if the solr doc defined by this id exists
-      def self.doc_exists?(id, solr_connector)
-        response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
-        response['response']['numFound'] == 1
-      end
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # send hard commit to solr
-      def self.commit(solr_connector)
-        RestClient.post self.solr_url(solr_connector), {},:content_type => :json, :accept=>:json
-      end
-      # It is an internal method that updates the solr doc instead of adding a new one.
-      # @param id [String] the document id, usually it will be druid.
-      # @param solr_doc [Hash] is the solr doc in hash format
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      def self.update_solr_doc(id, solr_doc, solr_connector)
-        # update_solr_doc can't used RSolr because updating hash doc is not supported
-        #  so we need to build the json input manually
-        params = "[{\"id\":\"#{id}\","
-        solr_doc.each do |field_name, new_values|
-          next if field_name == :id
-          params += "\"#{field_name}\":"
-          new_values = [new_values] unless new_values.class == Array
-          new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
-          params += "{\"set\":[\"#{new_values.join('","')}\"]},"
-        end
-        params.chomp!(',')
-        params += '}]'
-        RestClient.post self.solr_url(solr_connector), params, content_type: :json, accept: :json
-      end
-      # adjust the solr_url so it works with or without a trailing /
-      # @param solr_connector [RSolr::Client]  is an open connection with the solr core
-      # @return [String] the solr URL
-      def self.solr_url(solr_connector)
-        solr_url = solr_connector.options[:url]
-        if solr_url.end_with?('/')
-          "#{solr_url}update?commit=true"
-        else
-          "#{solr_url}/update?commit=true"
-        end
-      end
-    end
-  end
-end

data/lib/writer/solr_writer.rb DELETED Viewed

@@ -1,54 +0,0 @@
-require 'retries'
-require 'rsolr'
-module DiscoveryIndexer
-  module Writer
-    # Performs writes to solr client based upon true and false release flags
-    class SolrWriter
-      attr_reader :solr_targets_configs
-      include DiscoveryIndexer::Logging
-      def process(id, index_doc, targets, targets_configs)
-        @solr_targets_configs = targets_configs
-        index_targets = targets.select { |_, b| b }.keys
-        delete_targets = targets.reject { |_, b| b }.keys
-        # get targets with true
-        solr_index_client(id, index_doc, index_targets) if index_targets.present?
-        # get targets with false
-        solr_delete_client(id, delete_targets) if delete_targets.present?
-      end
-      def solr_delete_from_all(id, targets_configs)
-        # Get a list of all registered targets
-        @solr_targets_configs = targets_configs
-        targets = solr_targets_configs.keys
-        solr_delete_client(id, targets)
-      end
-      def solr_index_client(id, index_doc, targets)
-        targets.each do |solr_target|
-          solr_connector = get_connector_for_target(solr_target)
-          SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
-        end
-      end
-      def solr_delete_client(id, targets)
-        targets.each do |solr_target|
-          solr_connector = get_connector_for_target(solr_target)
-          SolrClient.delete(id, solr_connector) unless solr_connector.nil?
-        end
-      end
-      def get_connector_for_target(solr_target)
-        solr_connector = nil
-        if solr_targets_configs.keys.include?(solr_target)
-          config = solr_targets_configs[solr_target]
-          solr_connector = RSolr.connect(config.deep_symbolize_keys)
-        end
-        solr_connector
-      end
-    end
-  end
-end