RubyGems - harvestdor-indexer - Versions diffs - 1.0.4 → 2.0.0 - Mend

harvestdor-indexer 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.travis.yml +4 -2
data/Gemfile +1 -1
data/README.rdoc +1 -0
data/harvestdor-indexer.gemspec +4 -2
data/lib/harvestdor-indexer.rb +1 -317
data/lib/harvestdor/indexer.rb +159 -0
data/lib/harvestdor/indexer/metrics.rb +53 -0
data/lib/harvestdor/indexer/resource.rb +174 -0
data/lib/harvestdor/indexer/solr.rb +39 -0
data/lib/{harvestdor-indexer → harvestdor/indexer}/version.rb +1 -1
data/spec/config/ap.yml +32 -44
data/spec/fixtures/vcr_cassettes/get_collection_druids_call.yml +96 -0
data/spec/fixtures/vcr_cassettes/process_druids_whitelist_call.yml +1494 -16
data/spec/fixtures/vcr_cassettes/single_rsolr_connection_call.yml +80 -27
data/spec/spec_helper.rb +1 -1
data/spec/unit/harvestdor-indexer-resource_spec.rb +174 -0
data/spec/unit/harvestdor-indexer-solr_spec.rb +32 -0
data/spec/unit/harvestdor-indexer_spec.rb +47 -291
data/spec/unit/harvestdor/indexer/metrics_spec.rb +46 -0
metadata +45 -10
data/config/dor-fetcher-client.yml +0 -4
data/spec/config/ap_blacklist.txt +0 -5

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d9f1f173b34815b82678ad0919e7dea21b830297
-  data.tar.gz: e62f1eed6b4037528d2ad16c7df4a72a71d50d94
+  metadata.gz: 60218ff6d0f0210900b1b6a0df4d3a09a7122c34
+  data.tar.gz: 04ba680a7f7864dca78cbc9751c8eb9f6f9c811b
 SHA512:
-  metadata.gz: b3ee85a3b7d93200af455520f470152239ca185121bce1855bfd009b01f83b6135c29d2eccdcf8d6b0215637d675ccd6c6614d5380fbc003102eef0631bf9949
-  data.tar.gz: 34eec119b372e6590e266bb1a0ee87420de780f7f0b908d5570df0f9c58ab68c4e2363e15a525533a4bd56cce48b73d6369f5fd41cb34e8cbb31718456e02b83
+  metadata.gz: 2df842d3b19d9750f7a5e88f9af9dab60cfffa34cc80d7584b5b16f825aff64478e4c1ba3dde05de5749a9b8c17cbabd29fb55b4c71c965253b32d446686013b
+  data.tar.gz: 3c479f7be2c27ad39acad3316b77987dce8c487a654f47758513a4c4d094973ed7b0168f85307a5135de212d7469478e4e475285bd490250058b03869fea6619

data/.travis.yml CHANGED

@@ -1,10 +1,12 @@
 language: ruby
 script: rake rspec
 rvm:
-  - 2.1.1
+  - 2.2.0
+  - 2.1.5
   - 2.0.0
   - 1.9.3
-  - jruby-19mode # JRuby in 1.9 mode
+  - jruby-1.7.9-d19 # fails after 1.7.10 for some reason
+#  - jruby-19mode # JRuby in 1.9 mode
 notifications:
   email:
     - ndushay@stanford.edu

data/Gemfile CHANGED

@@ -1,4 +1,4 @@
 source 'https://rubygems.org'
 # See harvestdor-indexer.gemspec for this gem's dependencies
-gemspec
+gemspec

data/README.rdoc CHANGED

@@ -118,6 +118,7 @@ I suggest you run your code on harvestdor-dev, as it is already set up to be abl
 == Releases
+* <b>2.0.0</b> Complete refactor to update APIs, merge configuration yml files, update to rspec 3
 * <b>1.0.4</b> Set skip_heartbeat to true in the initialization of the DorFetcher::Client for ease of testing
 * <b>1.0.3</b> Implemented class level config so anything that inherits from Harvestdor::Indexer can share configuration settings
 * <b>1.0.0</b> Replaced OAI harvesting mechanism with dor-fetcher

data/harvestdor-indexer.gemspec CHANGED

@@ -1,7 +1,7 @@
 # -*- encoding: utf-8 -*-
 lib = File.expand_path('../lib', __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'harvestdor-indexer/version'
+require 'harvestdor/indexer/version'
 Gem::Specification.new do |gem|
   gem.name          = "harvestdor-indexer"
@@ -22,6 +22,8 @@ Gem::Specification.new do |gem|
   gem.add_dependency 'harvestdor', '>=0.0.14'
   gem.add_dependency 'stanford-mods'
   gem.add_dependency 'dor-fetcher', '=1.0.5'
+  gem.add_dependency "activesupport"
+  gem.add_dependency "parallel"
   # Runtime dependencies
   gem.add_runtime_dependency 'confstruct'
@@ -34,7 +36,7 @@ Gem::Specification.new do |gem|
   gem.add_development_dependency "rdoc"
   gem.add_development_dependency "yard"
   # tests
-	gem.add_development_dependency 'rspec'
+	gem.add_development_dependency 'rspec', "~> 3.0"
 	gem.add_development_dependency 'coveralls'
 	# gem.add_development_dependency 'ruby-debug19'
   gem.add_development_dependency 'vcr'

data/lib/harvestdor-indexer.rb CHANGED

@@ -1,317 +1 @@
-# external gems
-require 'confstruct'
-require 'rsolr'
-require 'retries'
-require 'json'
-# sul-dlss gems
-require 'harvestdor'
-require 'stanford-mods'
-require 'dor-fetcher'
-# stdlib
-require 'logger'
-require "harvestdor-indexer/version"
-module Harvestdor
-  # Base class to harvest from DOR via harvestdor gem and then index
-  class Indexer
-    attr_accessor :error_count, :success_count, :max_retries
-    attr_accessor :total_time_to_parse,:total_time_to_solr
-    attr_accessor :dor_fetcher_client, :client_config
-    # Class level config variable
-    @@config ||= Confstruct::Configuration.new()
-    def initialize yml_path, client_config_path, options = {}
-      @success_count=0    # the number of objects successfully indexed
-      @error_count=0      # the number of objects that failed
-      @max_retries=10      # the number of times to retry an object
-      @total_time_to_solr=0
-      @total_time_to_parse=0
-      @yml_path = yml_path
-      config.configure(YAML.load_file(yml_path)) if yml_path
-      config.configure options
-      yield(config) if block_given?
-      @client_config = YAML.load_file(client_config_path) if client_config_path && File.exists?(client_config_path)
-      # Adding skip_heartbeat param for easier testing
-      @dor_fetcher_client=DorFetcher::Client.new({:service_url => client_config["dor_fetcher_service_url"], :skip_heartbeat => true})
-    end
-    # to allow class level access to config variables for record_merger and solr_doc_builder
-    #  (rather than passing a lot of params to constructor)
-    def self.config
-      @@config ||= Confstruct::Configuration.new()
-    end
-    def config
-      Indexer.config
-    end
-    def logger
-      @logger ||= load_logger(config.log_dir, config.log_name)
-    end
-    # per this Indexer's config options
-    #  harvest the druids via DorFetcher
-    #   create a Solr profiling document for each druid
-    #   write the result to the Solr index
-    def harvest_and_index
-      start_time=Time.now
-      logger.info("Started harvest_and_index at #{start_time}")
-      if whitelist.empty?
-        druids.each { |druid| index druid }
-      else
-        whitelist.each { |druid| index druid }
-      end
-      solr_client.commit
-      total_time=elapsed_time(start_time)
-      total_objects=@success_count+@error_count
-      logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
-      logger.info("Total elapsed time for harvest and index: #{(total_time/60.0).round(2)} minutes")
-      logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr/@success_count).round(2)} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
-      logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr/total_objects).round(2)} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
-      logger.info("Avg parse time per object (successful): #{(@total_time_to_parse/@success_count).round(2)} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
-      logger.info("Avg parse time per object (all): #{(@total_time_to_parse/total_objects).round(2)} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
-      logger.info("Avg complete index time per object (successful): #{(total_time/@success_count).round(2)} seconds") unless (@success_count == 0)
-      logger.info("Avg complete index time per object (all): #{(total_time/total_objects).round(2)} seconds") unless (@error_count == 0 || total_objects == 0)
-      logger.info("Successful count: #{@success_count}")
-      logger.info("Error count: #{@error_count}")
-      logger.info("Total records processed: #{total_objects}")
-    end
-    # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
-    # @return [Array<String>] or enumeration over it, if block is given.  (strings are druids, e.g. ab123cd1234)
-    def druids
-      if @druids.nil?
-        start_time=Time.now
-        logger.info("Starting DorFetcher pulling of druids at #{start_time}.")
-        @druids = @dor_fetcher_client.druid_array(@dor_fetcher_client.get_collection(strip_default_set_string(), {}))
-        logger.info("Completed DorFetcher pulling of druids at #{Time.now}.  Found #{@druids.size} druids.  Total elapsed time for DorFetcher pulling = #{elapsed_time(start_time,:minutes)} minutes")
-      end
-      return @druids
-    end
-    # Add the document to solr, retry if an error occurs.
-    # See https://github.com/ooyala/retries for docs on with_retries.
-    # @param [Hash] doc a Hash representation of the solr document
-    # @param [String] id the id of the document being sent, for logging
-    def solr_add(doc, id)
-      max_tries=@max_retries ? @max_retries : 10 #if @max_retries isn't set, use 10
-      handler = Proc.new do |exception, attempt_number, total_delay|
-        logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
-        # logger.debug exception.backtrace
-      end
-      with_retries(:max_tries => max_tries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
-        logger.debug "Attempt #{attempt} for #{id}"
-        solr_client.add(doc)
-        logger.info "Successfully indexed #{id} on attempt #{attempt}"
-      end
-    end
-    # create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
-    #  NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
-    def index druid
-      if blacklist.include?(druid)
-        logger.info("Druid #{druid} is on the blacklist and will have no Solr doc created")
-      else
-        logger.fatal("You must override the index method to transform druids into Solr docs and add them to Solr")
-        begin
-          start_time=Time.now
-          logger.info("About to index #{druid} at #{start_time}")
-          #logger.debug "About to index #{druid}"
-          doc_hash = {}
-          doc_hash[:id] = druid
-          # doc_hash[:title_tsim] = smods_rec(druid).short_title
-          # you might add things from Indexer level class here
-          #  (e.g. things that are the same across all documents in the harvest)
-          solr_client.add(doc_hash)
-          logger.info("Indexed #{druid} in #{elapsed_time(start_time)} seconds")
-          @success_count+=1
-          # TODO: provide call to code to update DOR object's workflow datastream??
-        rescue => e
-          @error_count+=1
-          logger.error "Failed to index #{druid} in #{elapsed_time(start_time)} seconds: #{e.message}"
-        end
-      end
-    end
-    # return the MODS for the druid as a Stanford::Mods::Record object
-    # @param [String] druid e.g. ab123cd4567
-    # @return [Stanford::Mods::Record] created from the MODS xml for the druid
-    def smods_rec druid
-      start_time=Time.now
-      ng_doc = harvestdor_client.mods druid
-      logger.info("Fetched MODs for #{druid} in #{elapsed_time(start_time)} seconds")
-      raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
-      mods_rec = Stanford::Mods::Record.new
-      mods_rec.from_nk_node(ng_doc.root)
-      mods_rec
-    end
-    # the public xml for this DOR object, from the purl page
-    # @param [String] druid e.g. ab123cd4567
-    # @return [Nokogiri::XML::Document] the public xml for the DOR object
-    def public_xml druid
-      start_time=Time.now
-      ng_doc = harvestdor_client.public_xml druid
-      logger.info("Fetched public_xml for #{druid} in #{elapsed_time(start_time)} seconds")
-      raise "No public xml for #{druid}" if !ng_doc
-      raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
-      ng_doc
-    end
-    # the contentMetadata for this DOR object, ultimately from the purl public xml
-    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
-    #  a Nokogiri::XML::Document containing the public_xml for an object
-    # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
-    def content_metadata object
-      start_time=Time.now
-      ng_doc = harvestdor_client.content_metadata object
-      logger.info("Fetched content_metadata in #{elapsed_time(start_time)} seconds")
-      raise "No contentMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
-      ng_doc
-    end
-    # the identityMetadata for this DOR object, ultimately from the purl public xml
-    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
-    #  a Nokogiri::XML::Document containing the public_xml for an object
-    # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
-    def identity_metadata object
-      start_time=Time.now
-      ng_doc = harvestdor_client.identity_metadata object
-      logger.info("Fetched identity_metadata in #{elapsed_time(start_time)} seconds")
-      raise "No identityMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
-      ng_doc
-    end
-    # the rightsMetadata for this DOR object, ultimately from the purl public xml
-    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
-    #  a Nokogiri::XML::Document containing the public_xml for an object
-    # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
-    def rights_metadata object
-      start_time=Time.now
-      ng_doc = harvestdor_client.rights_metadata object
-      logger.info("Fetched rights_metadata in #{elapsed_time(start_time)} seconds")
-      raise "No rightsMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
-      ng_doc
-    end
-    # the RDF for this DOR object, ultimately from the purl public xml
-    # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
-    #  a Nokogiri::XML::Document containing the public_xml for an object
-    # @return [Nokogiri::XML::Document] the RDF for the DOR object
-    def rdf object
-      start_time=Time.now
-      ng_doc = harvestdor_client.rdf object
-      logger.info("Fetched rdf in #{elapsed_time(start_time)} seconds")
-      raise "No RDF for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
-      ng_doc
-    end
-    def solr_client
-      @solr_client ||= RSolr.connect(config.solr.to_hash)
-    end
-    # @return an Array of druids ('oo000oo0000') that should NOT be processed
-    def blacklist
-      # avoid trying to load the file multiple times
-      if !@blacklist && !@loaded_blacklist
-        @blacklist = load_blacklist(config.blacklist) if config.blacklist
-      end
-      @blacklist ||= []
-    end
-    # @return an Array of druids ('oo000oo0000') that should be processed
-    def whitelist
-      # avoid trying to load the file multiple times
-      if !@whitelist && !@loaded_whitelist
-        @whitelist = load_whitelist(config.whitelist) if config.whitelist
-      end
-      @whitelist ||= []
-    end
-    # Get only the druid from the end of the default_set string
-    # from the yml file
-    def strip_default_set_string()
-      Indexer.config.default_set.split('_').last
-    end
-    protected #---------------------------------------------------------------------
-    def harvestdor_client
-      @harvestdor_client ||= Harvestdor::Client.new({:config_yml_path => @yml_path})
-    end
-    def elapsed_time(start_time,units=:seconds)
-      elapsed_seconds=Time.now-start_time
-      case units
-      when :seconds
-        return elapsed_seconds.round(2)
-      when :minutes
-        return (elapsed_seconds/60.0).round(1)
-      when :hours
-        return (elapsed_seconds/3600.0).round(2)
-      else
-        return elapsed_seconds
-      end
-    end
-    # populate @blacklist as an Array of druids ('oo000oo0000') that will NOT be processed
-    #  by reading the File at the indicated path
-    # @param [String] path - path of file containing a list of druids
-    def load_blacklist path
-      if path && !@loaded_blacklist
-        @loaded_blacklist = true
-        @blacklist = load_id_list path
-      end
-    end
-    # populate @blacklist as an Array of druids ('oo000oo0000') that WILL be processed
-    #  (unless a druid is also on the blacklist)
-    #  by reading the File at the indicated path
-    # @param [String] path - path of file containing a list of druids
-    def load_whitelist path
-      if path && !@loaded_whitelist
-        @loaded_whitelist = true
-        @whitelist = load_id_list path
-      end
-    end
-    # return an Array of druids ('oo000oo0000')
-    #   populated by reading the File at the indicated path
-    # @param [String] path - path of file containing a list of druids
-    # @return [Array<String>] an Array of druids
-    def load_id_list path
-      if path
-        list = []
-        f = File.open(path).each_line { |line|
-          list << line.gsub(/\s+/, '') if !line.gsub(/\s+/, '').empty? && !line.strip.start_with?('#')
-        }
-        list
-      end
-    rescue
-      msg = "Unable to find list of druids at " + path
-      logger.fatal msg
-      raise msg
-    end
-    # Global, memoized, lazy initialized instance of a logger
-    # @param [String] log_dir directory for to get log file
-    # @param [String] log_name name of log file
-    def load_logger(log_dir, log_name)
-      Dir.mkdir(log_dir) unless File.directory?(log_dir)
-      @logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
-    end
-  end # Indexer class
-end # Harvestdor module
+require 'harvestdor/indexer'

data/lib/harvestdor/indexer.rb ADDED

@@ -0,0 +1,159 @@
+# external gems
+require 'confstruct'
+require 'rsolr'
+require 'retries'
+require 'parallel'
+require 'json'
+# sul-dlss gems
+require 'harvestdor'
+require 'stanford-mods'
+require 'dor-fetcher'
+# stdlib
+require 'logger'
+require "harvestdor/indexer/version"
+require 'active_support/benchmarkable'
+module Harvestdor
+  # Base class to harvest from DOR via harvestdor gem and then index
+  class Indexer
+    require "harvestdor/indexer/metrics"
+    require "harvestdor/indexer/resource"
+    require "harvestdor/indexer/solr"
+    include ActiveSupport::Benchmarkable
+    attr_accessor :metrics, :logger
+    def initialize options = {}
+      config.configure(options)
+      yield(config) if block_given?
+      @metrics = Harvestdor::Indexer::Metrics.new logger: logger
+    end
+    def config
+      @config ||= Confstruct::Configuration.new
+    end
+    def logger
+      @logger ||= begin
+        if config.harvestdor
+          Dir.mkdir(config.harvestdor.log_dir) unless File.directory?(config.harvestdor.log_dir)
+          Logger.new(File.join(config.harvestdor.log_dir, config.harvestdor.log_name), 'daily')
+        else
+          Logger.new STDERR
+        end
+      end
+    end
+    # per this Indexer's config options
+    #  harvest the druids via DorFetcher
+    #   create a Solr profiling document for each druid
+    #   write the result to the Solr index
+    def harvest_and_index each_options = {in_threads: 4}
+      benchmark "Harvest and Indexing" do
+        each_resource(each_options) do |resource|
+          index resource
+        end
+        solr.commit!
+      end
+    end
+    def resources
+      druids.map do |x|
+        Harvestdor::Indexer::Resource.new(self, x)
+      end.map do |x|
+        [x, (x.items if x.collection?)]
+      end.flatten.uniq.compact
+    end
+    def each_resource options = {}, &block
+      benchmark "" do
+        Parallel.each(resources, options) do |resource|
+          metrics.tally on_error: method(:resource_error) do
+            yield resource
+          end
+        end
+      end
+      logger.info("Successful count: #{metrics.success_count}")
+      logger.info("Error count: #{metrics.error_count}")
+      logger.info("Total records processed: #{metrics.total}")
+    end
+    def resource_error e
+      if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill
+        raise e
+      end
+    end
+    # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
+    # @return [Array<String>] or enumeration over it, if block is given.  (strings are druids, e.g. ab123cd1234)
+    def druids
+      @druids ||= whitelist
+    end
+    # create Solr doc for the druid and add it to Solr
+    #  NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
+    def index resource
+      benchmark "Indexing #{resource.druid}" do
+        logger.debug "About to index #{resource.druid}"
+        doc_hash = {}
+        doc_hash[:id] = resource.druid
+        # you might add things from Indexer level class here
+        #  (e.g. things that are the same across all documents in the harvest)
+        solr.add doc_hash
+        # TODO: provide call to code to update DOR object's workflow datastream??
+      end
+    end
+    # @return an Array of druids ('oo000oo0000') that should be processed
+    def whitelist
+      @whitelist ||= config.whitelist if config.whitelist.is_a? Array
+      @whitelist ||= load_whitelist(config.whitelist) if config.whitelist
+      @whitelist ||= []
+    end
+    def harvestdor_client
+      @harvestdor_client ||= Harvestdor::Client.new(config.harvestdor)
+    end
+    def dor_fetcher_client
+      @dor_fetcher_client ||= DorFetcher::Client.new(config.dor_fetcher)
+    end
+    def solr
+      @solr ||= Harvestdor::Indexer::Solr.new self, config.solr.to_hash
+    end
+    protected #---------------------------------------------------------------------
+    # populate @whitelist as an Array of druids ('oo000oo0000') that WILL be processed
+    #  by reading the File at the indicated path
+    # @param [String] path - path of file containing a list of druids
+    def load_whitelist path
+      @whitelist = load_id_list path
+    end
+    # return an Array of druids ('oo000oo0000')
+    #   populated by reading the File at the indicated path
+    # @param [String] path - path of file containing a list of druids
+    # @return [Array<String>] an Array of druids
+    def load_id_list path
+      list = File.open(path).each_line
+              .map { |line| line.strip }
+              .reject { |line| line.strip.start_with?('#') }
+              .reject { |line| line.empty? }
+    rescue
+      msg = "Unable to find list of druids at " + path
+      logger.fatal msg
+      raise msg
+    end
+  end # Indexer class
+end # Harvestdor module