base_indexer 0.6.1 → 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cfc68bc2f274b9751be97b344ad79f82fce07bf8
4
- data.tar.gz: be150decf5e3f61db3d38604e33908c79b72f5be
3
+ metadata.gz: 1fd0a69da41b249eeebbea882c38c0de1fe0bd70
4
+ data.tar.gz: 53d6ed530f7463518f0c7621ea50354bf9edb185
5
5
  SHA512:
6
- metadata.gz: 5f740995b696b60d857f82aefeece318b52da8ef1f1014858a7bbf28b936060bf1518db44a66405c93f668bbde09175f1eb3d3d7bca9041abb38a24d8122c2a6
7
- data.tar.gz: 3eb96fa4ed705891253287c539f870fce5cfd9a57d6581f53cdcec1d50ec631753fc35b093921a5b14bf99fceb1cfd8c73eef3c5047adcd6bcd577b35298953a
6
+ metadata.gz: 36315c92a7aa60a2b2d3414cc2162771e3eadcfb1c765560ae4756f1256407d05c43f51dd3957f593238b8645acfdb2ea4e9f4e7f998160b098deb513c86a347
7
+ data.tar.gz: 0bc53b7fa053c1b1a4c4c446451e9741cf4dcf4bb72f0e6d071ca2de241827ac0f7a1c4393d16a0df4e137241c081f64b86d63e4dcc4ce5bee100a3707d3c06a
@@ -5,4 +5,4 @@ BaseIndexer.indexer_class = 'BaseIndexer::MainIndexerEngine'
5
5
  BaseIndexer.solr_configuration_class_name = 'BaseIndexer::SolrConfigurationFromFile'
6
6
  # BaseIndexer.solr_configuration_class.constantize.new(Rails.configuration.solr_config_file_path)
7
7
  BaseIndexer.mapper_class_name = 'DiscoveryIndexer::Mapper::GeneralMapper'
8
- BaseIndexer.solr_writer_class_name = 'DiscoveryIndexer::Writer::SolrWriter'
8
+ BaseIndexer.solr_writer_class_name = 'BaseIndexer::Solr::Writer'
@@ -26,27 +26,12 @@ module BaseIndexer
26
26
  #
27
27
  # @raise it will raise errors if any problems happen at any level
28
28
  def index(druid, targets = nil)
29
- # Read input mods and purl
30
- purl_model = read_purl(druid)
31
- mods_model = read_mods(druid)
32
- collection_data = collection_data(purl_model.collection_druids)
33
-
34
29
  # Map the input to solr_doc
35
- solr_doc = BaseIndexer.mapper_class_name.constantize.new(druid, mods_model, purl_model, collection_data).convert_to_solr_doc
36
-
37
- # Get target list
38
- targets_hash = {}
39
- if targets.present?
40
- targets_hash = targets_hash_from_param(targets)
41
- else
42
- targets_hash = purl_model.release_tags_hash
43
- end
44
-
45
- targets_hash = update_targets_before_write(targets_hash, purl_model)
30
+ solr_doc = BaseIndexer.mapper_class_name.constantize.new(druid).convert_to_solr_doc
46
31
 
47
32
  # Get SOLR configuration and write
48
33
  solr_targets_configs = BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash
49
- BaseIndexer.solr_writer_class_name.constantize.new.process(druid, solr_doc, targets_hash, solr_targets_configs)
34
+ BaseIndexer.solr_writer_class_name.constantize.new.process(druid, solr_doc, targets, solr_targets_configs)
50
35
  end
51
36
 
52
37
  # It deletes an item defined by druid from all registered solr core
@@ -56,54 +41,5 @@ module BaseIndexer
56
41
  BaseIndexer.solr_writer_class_name.constantize.new.solr_delete_from_all(druid, solr_targets_configs)
57
42
  end
58
43
 
59
- def read_purl(druid)
60
- DiscoveryIndexer::InputXml::Purlxml.new(druid).load
61
- end
62
-
63
- def read_mods(druid)
64
- DiscoveryIndexer::InputXml::Modsxml.new(druid).load
65
- end
66
-
67
- # It converts targets array to targets hash
68
- # @param targets [Array] a list of specific targets
69
- # @return [Hash] a hash of targets with true value
70
- # @example convert target list
71
- # targets_hash_from_param( ["searchworks","revs"] )
72
- # {"searchworks"=>true, "revs"=>true}
73
- def targets_hash_from_param(targets)
74
- targets_hash = {}
75
- unless targets.nil?
76
- targets.each do |target|
77
- targets_hash[target] = true
78
- end
79
- end
80
- targets_hash
81
- end
82
-
83
- # It allows the consumer to modify the targets list before doing the final writing
84
- # to the solr core. Default behavior returns the targets_hash as it is
85
- # @param targets_hash [Hash] a hash of targets with true value
86
- # @param purl_model [DiscoveryIndexer::Reader::PurlxmlModel] represents the purlxml model
87
- # @return [Hash] a hash of targets
88
- def update_targets_before_write(targets_hash, _purl_model)
89
- targets_hash
90
- end
91
-
92
- # It converts collection_druids list to a hash with names. If the druid doesn't
93
- # have a collection name, it will be excluded from the hash
94
- # @param collection_druids [Array] a list of druids
95
- # !["ab123cd4567", "ef123gh4567"]
96
- # @return [Hash] a hash for collection druid and its name
97
- # !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
98
- def collection_data(collection_druids)
99
- collection_data = {}
100
- unless collection_druids.nil?
101
- collection_druids.each do |cdruid|
102
- cdata = BaseIndexer::Collection.new(cdruid).collection_info
103
- collection_data[cdruid] = cdata if cdata.present?
104
- end
105
- end
106
- collection_data
107
- end
108
44
  end
109
45
  end
@@ -0,0 +1,113 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+ require 'rest-client'
4
+ module BaseIndexer
5
+ module Solr
6
+ # Processes adds and deletes to the solr core
7
+ class Client
8
+ include DiscoveryIndexer::Logging
9
+
10
+ # Add the document to solr, retry if an error occurs.
11
+ # See https://github.com/ooyala/retries for docs on with_retries.
12
+ # @param id [String] the document id, usually it will be druid.
13
+ # @param solr_doc [Hash] a Hash representation of the solr document
14
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
15
+ # @param max_retries [Integer] the maximum number of tries before fail
16
+ def self.add(id, solr_doc, solr_connector, max_retries = 10)
17
+ process(id, solr_doc, solr_connector, max_retries, false)
18
+ end
19
+
20
+ # Add the document to solr, retry if an error occurs.
21
+ # See https://github.com/ooyala/retries for docs on with_retries.
22
+ # @param id [String] the document id, usually it will be druid.
23
+ # @param solr_connector[RSolr::Client] is an open connection with the solr core
24
+ # @param max_retries [Integer] the maximum number of tries before fail
25
+ def self.delete(id, solr_connector, max_retries = 10)
26
+ process(id, {}, solr_connector, max_retries, true)
27
+ end
28
+
29
+ # It's an internal method that receives all the requests and deals with
30
+ # SOLR core. This method can call add, delete, or update
31
+ #
32
+ # @param id [String] the document id, usually it will be druid.
33
+ # @param solr_doc [Hash] is the solr doc in hash format
34
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
35
+ # @param max_retries [Integer] the maximum number of tries before fail
36
+ def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
+ handler = proc do |exception, attempt_number, _total_delay|
38
+ DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
39
+ end
40
+
41
+ with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
42
+ DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
43
+
44
+ if is_delete
45
+ DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
46
+ solr_connector.delete_by_id(id, :add_attributes => {:commitWithin => 10000})
47
+ elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
48
+ DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
49
+ update_solr_doc(id, solr_doc, solr_connector)
50
+ else
51
+ DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
52
+ solr_connector.add(solr_doc, :add_attributes => {:commitWithin => 10000})
53
+ end
54
+ #solr_connector.commit
55
+ DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
56
+ end
57
+ end
58
+
59
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
+ # @return [Boolean] true if the solr core allowing update feature
61
+ def self.allow_update?(solr_connector)
62
+ solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
+ end
64
+
65
+ # @param id [String] the document id, usually it will be druid.
66
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
+ # @return [Boolean] true if the solr doc defined by this id exists
68
+ def self.doc_exists?(id, solr_connector)
69
+ response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
+ response['response']['numFound'] == 1
71
+ end
72
+
73
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
74
+ # send hard commit to solr
75
+ def self.commit(solr_connector)
76
+ RestClient.post self.solr_url(solr_connector), {},:content_type => :json, :accept=>:json
77
+ end
78
+
79
+ # It is an internal method that updates the solr doc instead of adding a new one.
80
+ # @param id [String] the document id, usually it will be druid.
81
+ # @param solr_doc [Hash] is the solr doc in hash format
82
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
83
+ def self.update_solr_doc(id, solr_doc, solr_connector)
84
+ # update_solr_doc can't use RSolr because updating a hash doc is not supported
85
+ # so we need to build the json input manually
86
+ params = "[{\"id\":\"#{id}\","
87
+ solr_doc.each do |field_name, new_values|
88
+ next if field_name == :id
89
+ params += "\"#{field_name}\":"
90
+ new_values = [new_values] unless new_values.class == Array
91
+ new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
92
+ params += "{\"set\":[\"#{new_values.join('","')}\"]},"
93
+ end
94
+ params.chomp!(',')
95
+ params += '}]'
96
+ RestClient.post self.solr_url(solr_connector), params, content_type: :json, accept: :json
97
+ end
98
+
99
+ # adjust the solr_url so it works with or without a trailing /
100
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
101
+ # @return [String] the solr URL
102
+ def self.solr_url(solr_connector)
103
+ solr_url = solr_connector.options[:url]
104
+ if solr_url.end_with?('/')
105
+ "#{solr_url}update?commit=true"
106
+ else
107
+ "#{solr_url}/update?commit=true"
108
+ end
109
+ end
110
+
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,54 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+
4
+ module BaseIndexer
5
+ module Solr
6
+ # Performs writes to solr client based upon true and false release flags
7
+ class Writer
8
+ attr_reader :solr_targets_configs
9
+
10
+ include DiscoveryIndexer::Logging
11
+
12
+ def process(id, index_doc, targets, targets_configs)
13
+ @solr_targets_configs = targets_configs
14
+ index_targets = targets.select { |_, b| b }.keys
15
+ delete_targets = targets.reject { |_, b| b }.keys
16
+
17
+ # get targets with true
18
+ solr_index_client(id, index_doc, index_targets) if index_targets.present?
19
+ # get targets with false
20
+ solr_delete_client(id, delete_targets) if delete_targets.present?
21
+ end
22
+
23
+ def solr_delete_from_all(id, targets_configs)
24
+ # Get a list of all registered targets
25
+ @solr_targets_configs = targets_configs
26
+ targets = solr_targets_configs.keys
27
+ solr_delete_client(id, targets)
28
+ end
29
+
30
+ def solr_index_client(id, index_doc, targets)
31
+ targets.each do |solr_target|
32
+ solr_connector = get_connector_for_target(solr_target)
33
+ Client.add(id, index_doc, solr_connector) unless solr_connector.nil?
34
+ end
35
+ end
36
+
37
+ def solr_delete_client(id, targets)
38
+ targets.each do |solr_target|
39
+ solr_connector = get_connector_for_target(solr_target)
40
+ Client.delete(id, solr_connector) unless solr_connector.nil?
41
+ end
42
+ end
43
+
44
+ def get_connector_for_target(solr_target)
45
+ solr_connector = nil
46
+ if solr_targets_configs.keys.include?(solr_target)
47
+ config = solr_targets_configs[solr_target]
48
+ solr_connector = RSolr.connect(config.deep_symbolize_keys)
49
+ end
50
+ solr_connector
51
+ end
52
+ end
53
+ end
54
+ end
@@ -1,3 +1,3 @@
1
1
  module BaseIndexer
2
- VERSION = '0.6.1'
2
+ VERSION = '0.6.2'
3
3
  end
data/lib/base_indexer.rb CHANGED
@@ -1,9 +1,10 @@
1
1
  require 'base_indexer/engine'
2
2
 
3
3
  require 'base_indexer/main_indexer_engine'
4
- require 'base_indexer/solr/solr_configuration'
5
- require 'base_indexer/solr/solr_configuration_from_file'
6
- require 'base_indexer/collection'
4
+ require 'base_indexer/config/solr_configuration'
5
+ require 'base_indexer/config/solr_configuration_from_file'
6
+ require 'base_indexer/solr/client'
7
+ require 'base_indexer/solr/writer'
7
8
  require 'discovery-indexer'
8
9
  module BaseIndexer
9
10
  mattr_accessor :indexer_class
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: base_indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed Alsum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-11 00:00:00.000000000 Z
12
+ date: 2015-12-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rails
@@ -220,7 +220,6 @@ executables: []
220
220
  extensions: []
221
221
  extra_rdoc_files: []
222
222
  files:
223
- - README.rdoc
224
223
  - Rakefile
225
224
  - app/controllers/base_indexer/about_controller.rb
226
225
  - app/controllers/base_indexer/application_controller.rb
@@ -235,11 +234,12 @@ files:
235
234
  - config/initializers/is_it_working.rb
236
235
  - config/routes.rb
237
236
  - lib/base_indexer.rb
238
- - lib/base_indexer/collection.rb
237
+ - lib/base_indexer/config/solr_configuration.rb
238
+ - lib/base_indexer/config/solr_configuration_from_file.rb
239
239
  - lib/base_indexer/engine.rb
240
240
  - lib/base_indexer/main_indexer_engine.rb
241
- - lib/base_indexer/solr/solr_configuration.rb
242
- - lib/base_indexer/solr/solr_configuration_from_file.rb
241
+ - lib/base_indexer/solr/client.rb
242
+ - lib/base_indexer/solr/writer.rb
243
243
  - lib/base_indexer/version.rb
244
244
  - lib/generators/base_indexer/install_generator.rb
245
245
  - lib/generators/base_indexer/templates/solr.yml
data/README.rdoc DELETED
@@ -1,93 +0,0 @@
1
- {<img src="https://travis-ci.org/sul-dlss/base_indexer.svg?branch=master" alt="Build Status" />}[https://travis-ci.org/sul-dlss/base_indexer] {<img src="https://coveralls.io/repos/sul-dlss/base_indexer/badge.svg" alt="Coverage Status" />}[https://coveralls.io/r/sul-dlss/base_indexer]
2
-
3
-
4
- = BaseIndexer
5
-
6
- This project rocks and uses MIT-LICENSE.
7
-
8
- == Running tests
9
-
10
- Clone from github.
11
- rake # first time setup and to generate all docs
12
- bundle exec rake spec # just run the tests next time around
13
-
14
-
15
- == Steps to hook the base_indexer engine in your app
16
- * Generate new rails app
17
- rails new my_indexer_app
18
-
19
- * Edit Gemfile and add the base_indexer gem name
20
- gem 'base_indexer'
21
-
22
- * Run bundle install to download the gem
23
- bundle install
24
-
25
- * Mount the engine in your favorite domain.
26
- mount BaseIndexer::Engine, at: '/items'
27
-
28
- == Basic configuration
29
- The engine is looking for the following values
30
-
31
- config.solr_config_file_path = "#{config.root}/config/solr.yml"
32
- DiscoveryIndexer::PURL_DEFAULT='https://purl.stanford.edu'
33
-
34
-
35
- == Advanced features
36
-
37
- The engine gives the developer the ability to extend any of its classes
38
-
39
- To extend any of indexer features (purl-reader, mods-reader, mapper, solr-writer)
40
-
41
- 1. Create a new class that inherits from BaseIndexer::MainIndexerEngine
42
- 2. Create a new file named config/initializers/base_indexer.rb
43
- 3. In this file, add the following line. Replace 'MyIndexerClassName' with the fully qualified actual class name. The name should be between double quotes
44
- BaseIndexer.indexer_class = "MyIndexerClassName"
45
- 4. In the new indexer class, you can override any of the functions that you need to change its implementation. For example, if you need to use a new mapper, you will override map function.
46
-
47
- To extend mapper functionality.
48
- 1. Create a new class e.g., MyMapper that inherits from GeneralMapper or IndexMapper.
49
- 2. Implement MyMapper.map to converts the input to solr doc hash.
50
- 3. Override MyIndexerClassName.map to call your new class instead of the default one.
51
-
52
- == Rake Tasks For Indexing Druids
53
-
54
- All rake tasks that perform batch indexing will generate log files in the "log" folder within the app itself. You can tail the log file to watch the progress. The
55
- log file is also useful since you can pass it to the "reindexer" rake task to retry just the errored out druids. The name of the log file will depend on which
56
- rake task you are running, and will be timestamped to be unique.
57
-
58
- ==== Index a single druid:
59
-
60
- rake index RAILS_ENV=production target=revs_prod druid=oo000oo0001
61
-
62
- ==== Index a list of druids from a pre-assembly run, a remediation run, or a simple CSV:
63
-
64
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly = preassembly run
65
- nohup rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly & = for a long running process, which will be most runs that have more than a few dozen druids, nohup it
66
-
67
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediate.yaml log_type=remediate = remediation run
68
-
69
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander.csv log_type=csv = a simple csv file -- it must have a header line, with the header of "druid" defining the items you wish to index
70
-
71
- ==== Index an entire collection, including the collection itself, along with all of its members (be sure to check the dor-fetcher-url parameter in the Rails environment you are running under to be sure it is connecting where you expect):
72
-
73
- rake collection_indexer RAILS_ENV=production target=revs_prod collection_druid=oo000oo0001
74
- nohup rake collection_indexer RAILS_ENV=production target=revs_prod collection_druid=oo000oo0001 & = for a long running process, e.g. a collection with more than a few dozen druids, nohup it
75
-
76
- ==== Re-Index Just Errored Out Items
77
-
78
- If you had errors when indexing from a preassembly/remediation log or from indexing an entire collection, you can re-run the errored out druids only with the log file. All log files are kept in the log folder in the revs-indexer-service app.
79
-
80
- rake reindexer RAILS_ENV=production target=revs_prod file=log/logfile.log
81
-
82
- nohup rake reindexer RAILS_ENV=production target=revs_prod file=log/logfile.log & = probably no need to nohup unless there were alot of errors
83
-
84
-
85
- ==== Delete Druids
86
-
87
- Delete a list of druids specified in a CSV/txt file. Be careful, this will delete from all targets! Put one druid per line, no header is necessary.
88
-
89
- rake delete_druids RAILS_ENV=production file=druid_list.txt
90
-
91
- ==== Delete a single druid
92
-
93
- rake delete RAILS_ENV=production druid=oo000oo0001
@@ -1,46 +0,0 @@
1
- module BaseIndexer
2
-
3
- # It caches the collection information such as name and catkey
4
- class Collection
5
-
6
- def initialize(collection_druid)
7
- @collection_druid = collection_druid
8
- end
9
-
10
- # Returns the collection name from cache, otherwise will fetch it from PURL.
11
- #
12
- # @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
13
- # @return [Array<String>] the collection data or [] if there is no name and catkey or the object
14
- # is not a collection
15
- def collection_info
16
- from_cache || from_purl || {}
17
- end
18
-
19
- private
20
-
21
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
22
- # @return [String] return the collection label from cache if available, nil otherwise
23
- def from_cache
24
- Rails.cache.read(@collection_druid)
25
- end
26
-
27
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
28
- # @return [String] return the collection label from purl if available, nil otherwise
29
- def from_purl
30
- return nil unless purl_model
31
- return nil unless purl_model.is_collection
32
- purl_data = { label: purl_model.label, ckey: purl_model.catkey }
33
- Rails.cache.write(@collection_druid, purl_data, expires_in: 1.hours)
34
- purl_data
35
- end
36
-
37
- def purl_model
38
- @purl_model ||= begin
39
- DiscoveryIndexer::InputXml::Purlxml.new(@collection_druid).load
40
- rescue => e
41
- Rails.logger.error "There is a problem in retrieving collection name and/or catkey for #{@collection_druid}. #{e.inspect}\n#{e.message }\n#{e.backtrace}"
42
- nil
43
- end
44
- end
45
- end
46
- end