base_indexer 0.6.1 → 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cfc68bc2f274b9751be97b344ad79f82fce07bf8
4
- data.tar.gz: be150decf5e3f61db3d38604e33908c79b72f5be
3
+ metadata.gz: 1fd0a69da41b249eeebbea882c38c0de1fe0bd70
4
+ data.tar.gz: 53d6ed530f7463518f0c7621ea50354bf9edb185
5
5
  SHA512:
6
- metadata.gz: 5f740995b696b60d857f82aefeece318b52da8ef1f1014858a7bbf28b936060bf1518db44a66405c93f668bbde09175f1eb3d3d7bca9041abb38a24d8122c2a6
7
- data.tar.gz: 3eb96fa4ed705891253287c539f870fce5cfd9a57d6581f53cdcec1d50ec631753fc35b093921a5b14bf99fceb1cfd8c73eef3c5047adcd6bcd577b35298953a
6
+ metadata.gz: 36315c92a7aa60a2b2d3414cc2162771e3eadcfb1c765560ae4756f1256407d05c43f51dd3957f593238b8645acfdb2ea4e9f4e7f998160b098deb513c86a347
7
+ data.tar.gz: 0bc53b7fa053c1b1a4c4c446451e9741cf4dcf4bb72f0e6d071ca2de241827ac0f7a1c4393d16a0df4e137241c081f64b86d63e4dcc4ce5bee100a3707d3c06a
@@ -5,4 +5,4 @@ BaseIndexer.indexer_class = 'BaseIndexer::MainIndexerEngine'
5
5
  BaseIndexer.solr_configuration_class_name = 'BaseIndexer::SolrConfigurationFromFile'
6
6
  # BaseIndexer.solr_configuration_class.constantize.new(Rails.configuration.solr_config_file_path)
7
7
  BaseIndexer.mapper_class_name = 'DiscoveryIndexer::Mapper::GeneralMapper'
8
- BaseIndexer.solr_writer_class_name = 'DiscoveryIndexer::Writer::SolrWriter'
8
+ BaseIndexer.solr_writer_class_name = 'BaseIndexer::Solr::Writer'
@@ -26,27 +26,12 @@ module BaseIndexer
26
26
  #
27
27
  # @raise it will raise errors if any problems happen at any level
28
28
  def index(druid, targets = nil)
29
- # Read input mods and purl
30
- purl_model = read_purl(druid)
31
- mods_model = read_mods(druid)
32
- collection_data = collection_data(purl_model.collection_druids)
33
-
34
29
  # Map the input to solr_doc
35
- solr_doc = BaseIndexer.mapper_class_name.constantize.new(druid, mods_model, purl_model, collection_data).convert_to_solr_doc
36
-
37
- # Get target list
38
- targets_hash = {}
39
- if targets.present?
40
- targets_hash = targets_hash_from_param(targets)
41
- else
42
- targets_hash = purl_model.release_tags_hash
43
- end
44
-
45
- targets_hash = update_targets_before_write(targets_hash, purl_model)
30
+ solr_doc = BaseIndexer.mapper_class_name.constantize.new(druid).convert_to_solr_doc
46
31
 
47
32
  # Get SOLR configuration and write
48
33
  solr_targets_configs = BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash
49
- BaseIndexer.solr_writer_class_name.constantize.new.process(druid, solr_doc, targets_hash, solr_targets_configs)
34
+ BaseIndexer.solr_writer_class_name.constantize.new.process(druid, solr_doc, targets, solr_targets_configs)
50
35
  end
51
36
 
52
37
  # It deletes an item defined by druid from all registered solr core
@@ -56,54 +41,5 @@ module BaseIndexer
56
41
  BaseIndexer.solr_writer_class_name.constantize.new.solr_delete_from_all(druid, solr_targets_configs)
57
42
  end
58
43
 
59
- def read_purl(druid)
60
- DiscoveryIndexer::InputXml::Purlxml.new(druid).load
61
- end
62
-
63
- def read_mods(druid)
64
- DiscoveryIndexer::InputXml::Modsxml.new(druid).load
65
- end
66
-
67
- # It converts targets array to targets hash
68
- # @param targets [Array] a list of specific targets
69
- # @return [Hash] a hash of targets with true value
70
- # @example convert target list
71
- # targets_hash_from_param( ["searchworks","revs"] )
72
- # {"searchworks"=>true, "revs"=>true}
73
- def targets_hash_from_param(targets)
74
- targets_hash = {}
75
- unless targets.nil?
76
- targets.each do |target|
77
- targets_hash[target] = true
78
- end
79
- end
80
- targets_hash
81
- end
82
-
83
- # It allows the consumer to modify the targets list before doing the final writing
84
- # to the solr core. Default behavior returns the targets_hash as it is
85
- # @param targets_hash [Hash] a hash of targets with true value
86
- # @param purl_model [DiscoveryIndexer::Reader::PurlxmlModel] represents the purlxml model
87
- # @return [Hash] a hash of targets
88
- def update_targets_before_write(targets_hash, _purl_model)
89
- targets_hash
90
- end
91
-
92
- # It converts collection_druids list to a hash with names. If the druid doesn't
93
- # have a collection name, it will be excluded from the hash
94
- # @param collection_druids [Array] a list of druids
95
- # !["ab123cd4567", "ef123gh4567"]
96
- # @return [Hash] a hash for collection druid and its name
97
- # !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
98
- def collection_data(collection_druids)
99
- collection_data = {}
100
- unless collection_druids.nil?
101
- collection_druids.each do |cdruid|
102
- cdata = BaseIndexer::Collection.new(cdruid).collection_info
103
- collection_data[cdruid] = cdata if cdata.present?
104
- end
105
- end
106
- collection_data
107
- end
108
44
  end
109
45
  end
@@ -0,0 +1,113 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+ require 'rest-client'
4
+ module BaseIndexer
5
+ module Solr
6
+ # Processes adds and deletes to the solr core
7
+ class Client
8
+ include DiscoveryIndexer::Logging
9
+
10
+ # Add the document to solr, retry if an error occurs.
11
+ # See https://github.com/ooyala/retries for docs on with_retries.
12
+ # @param id [String] the document id, usually it will be druid.
13
+ # @param solr_doc [Hash] a Hash representation of the solr document
14
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
15
+ # @param max_retries [Integer] the maximum number of tries before fail
16
+ def self.add(id, solr_doc, solr_connector, max_retries = 10)
17
+ process(id, solr_doc, solr_connector, max_retries, false)
18
+ end
19
+
20
+ # Add the document to solr, retry if an error occurs.
21
+ # See https://github.com/ooyala/retries for docs on with_retries.
22
+ # @param id [String] the document id, usually it will be druid.
23
+ # @param solr_connector[RSolr::Client] is an open connection with the solr core
24
+ # @param max_retries [Integer] the maximum number of tries before fail
25
+ def self.delete(id, solr_connector, max_retries = 10)
26
+ process(id, {}, solr_connector, max_retries, true)
27
+ end
28
+
29
+ # It's an internal method that receives all the requests and deals with
30
+ # SOLR core. This method can call add, delete, or update
31
+ #
32
+ # @param id [String] the document id, usually it will be druid.
33
+ # @param solr_doc [Hash] is the solr doc in hash format
34
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
35
+ # @param max_retries [Integer] the maximum number of tries before fail
36
+ def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
+ handler = proc do |exception, attempt_number, _total_delay|
38
+ DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
39
+ end
40
+
41
+ with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
42
+ DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
43
+
44
+ if is_delete
45
+ DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
46
+ solr_connector.delete_by_id(id, :add_attributes => {:commitWithin => 10000})
47
+ elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
48
+ DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
49
+ update_solr_doc(id, solr_doc, solr_connector)
50
+ else
51
+ DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
52
+ solr_connector.add(solr_doc, :add_attributes => {:commitWithin => 10000})
53
+ end
54
+ #solr_connector.commit
55
+ DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
56
+ end
57
+ end
58
+
59
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
+ # @return [Boolean] true if the solr core allowing update feature
61
+ def self.allow_update?(solr_connector)
62
+ solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
+ end
64
+
65
+ # @param id [String] the document id, usually it will be druid.
66
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
+ # @return [Boolean] true if the solr doc defined by this id exists
68
+ def self.doc_exists?(id, solr_connector)
69
+ response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
+ response['response']['numFound'] == 1
71
+ end
72
+
73
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
74
+ # send hard commit to solr
75
+ def self.commit(solr_connector)
76
+ RestClient.post self.solr_url(solr_connector), {},:content_type => :json, :accept=>:json
77
+ end
78
+
79
+ # It is an internal method that updates the solr doc instead of adding a new one.
80
+ # @param id [String] the document id, usually it will be druid.
81
+ # @param solr_doc [Hash] is the solr doc in hash format
82
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
83
+ def self.update_solr_doc(id, solr_doc, solr_connector)
84
+ # update_solr_doc can't use RSolr because updating a hash doc is not supported
85
+ # so we need to build the json input manually
86
+ params = "[{\"id\":\"#{id}\","
87
+ solr_doc.each do |field_name, new_values|
88
+ next if field_name == :id
89
+ params += "\"#{field_name}\":"
90
+ new_values = [new_values] unless new_values.class == Array
91
+ new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
92
+ params += "{\"set\":[\"#{new_values.join('","')}\"]},"
93
+ end
94
+ params.chomp!(',')
95
+ params += '}]'
96
+ RestClient.post self.solr_url(solr_connector), params, content_type: :json, accept: :json
97
+ end
98
+
99
+ # adjust the solr_url so it works with or without a trailing /
100
+ # @param solr_connector [RSolr::Client] is an open connection with the solr core
101
+ # @return [String] the solr URL
102
+ def self.solr_url(solr_connector)
103
+ solr_url = solr_connector.options[:url]
104
+ if solr_url.end_with?('/')
105
+ "#{solr_url}update?commit=true"
106
+ else
107
+ "#{solr_url}/update?commit=true"
108
+ end
109
+ end
110
+
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,54 @@
1
+ require 'retries'
2
+ require 'rsolr'
3
+
4
+ module BaseIndexer
5
+ module Solr
6
+ # Performs writes to solr client based upon true and false release flags
7
+ class Writer
8
+ attr_reader :solr_targets_configs
9
+
10
+ include DiscoveryIndexer::Logging
11
+
12
+ def process(id, index_doc, targets, targets_configs)
13
+ @solr_targets_configs = targets_configs
14
+ index_targets = targets.select { |_, b| b }.keys
15
+ delete_targets = targets.reject { |_, b| b }.keys
16
+
17
+ # get targets with true
18
+ solr_index_client(id, index_doc, index_targets) if index_targets.present?
19
+ # get targets with false
20
+ solr_delete_client(id, delete_targets) if delete_targets.present?
21
+ end
22
+
23
+ def solr_delete_from_all(id, targets_configs)
24
+ # Get a list of all registered targets
25
+ @solr_targets_configs = targets_configs
26
+ targets = solr_targets_configs.keys
27
+ solr_delete_client(id, targets)
28
+ end
29
+
30
+ def solr_index_client(id, index_doc, targets)
31
+ targets.each do |solr_target|
32
+ solr_connector = get_connector_for_target(solr_target)
33
+ Client.add(id, index_doc, solr_connector) unless solr_connector.nil?
34
+ end
35
+ end
36
+
37
+ def solr_delete_client(id, targets)
38
+ targets.each do |solr_target|
39
+ solr_connector = get_connector_for_target(solr_target)
40
+ Client.delete(id, solr_connector) unless solr_connector.nil?
41
+ end
42
+ end
43
+
44
+ def get_connector_for_target(solr_target)
45
+ solr_connector = nil
46
+ if solr_targets_configs.keys.include?(solr_target)
47
+ config = solr_targets_configs[solr_target]
48
+ solr_connector = RSolr.connect(config.deep_symbolize_keys)
49
+ end
50
+ solr_connector
51
+ end
52
+ end
53
+ end
54
+ end
@@ -1,3 +1,3 @@
1
1
  module BaseIndexer
2
- VERSION = '0.6.1'
2
+ VERSION = '0.6.2'
3
3
  end
data/lib/base_indexer.rb CHANGED
@@ -1,9 +1,10 @@
1
1
  require 'base_indexer/engine'
2
2
 
3
3
  require 'base_indexer/main_indexer_engine'
4
- require 'base_indexer/solr/solr_configuration'
5
- require 'base_indexer/solr/solr_configuration_from_file'
6
- require 'base_indexer/collection'
4
+ require 'base_indexer/config/solr_configuration'
5
+ require 'base_indexer/config/solr_configuration_from_file'
6
+ require 'base_indexer/solr/client'
7
+ require 'base_indexer/solr/writer'
7
8
  require 'discovery-indexer'
8
9
  module BaseIndexer
9
10
  mattr_accessor :indexer_class
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: base_indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed Alsum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-11 00:00:00.000000000 Z
12
+ date: 2015-12-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rails
@@ -220,7 +220,6 @@ executables: []
220
220
  extensions: []
221
221
  extra_rdoc_files: []
222
222
  files:
223
- - README.rdoc
224
223
  - Rakefile
225
224
  - app/controllers/base_indexer/about_controller.rb
226
225
  - app/controllers/base_indexer/application_controller.rb
@@ -235,11 +234,12 @@ files:
235
234
  - config/initializers/is_it_working.rb
236
235
  - config/routes.rb
237
236
  - lib/base_indexer.rb
238
- - lib/base_indexer/collection.rb
237
+ - lib/base_indexer/config/solr_configuration.rb
238
+ - lib/base_indexer/config/solr_configuration_from_file.rb
239
239
  - lib/base_indexer/engine.rb
240
240
  - lib/base_indexer/main_indexer_engine.rb
241
- - lib/base_indexer/solr/solr_configuration.rb
242
- - lib/base_indexer/solr/solr_configuration_from_file.rb
241
+ - lib/base_indexer/solr/client.rb
242
+ - lib/base_indexer/solr/writer.rb
243
243
  - lib/base_indexer/version.rb
244
244
  - lib/generators/base_indexer/install_generator.rb
245
245
  - lib/generators/base_indexer/templates/solr.yml
data/README.rdoc DELETED
@@ -1,93 +0,0 @@
1
- {<img src="https://travis-ci.org/sul-dlss/base_indexer.svg?branch=master" alt="Build Status" />}[https://travis-ci.org/sul-dlss/base_indexer] {<img src="https://coveralls.io/repos/sul-dlss/base_indexer/badge.svg" alt="Coverage Status" />}[https://coveralls.io/r/sul-dlss/base_indexer]
2
-
3
-
4
- = BaseIndexer
5
-
6
- This project rocks and uses MIT-LICENSE.
7
-
8
- == Running tests
9
-
10
- Clone from github.
11
- rake # first time setup and to generate all docs
12
- bundle exec rake spec # just run the tests next time around
13
-
14
-
15
- == Steps to hook the base_indexer engine in your app
16
- * Generate new rails app
17
- rails new my_indexer_app
18
-
19
- * Edit Gemfile and add the base_indexer gem name
20
- gem 'base_indexer'
21
-
22
- * Run bundle install to download the gem
23
- bundle install
24
-
25
- * Mount the engine in your favorite domain.
26
- mount BaseIndexer::Engine, at: '/items'
27
-
28
- == Basic configuration
29
- The engine is looking for the following values
30
-
31
- config.solr_config_file_path = "#{config.root}/config/solr.yml"
32
- DiscoveryIndexer::PURL_DEFAULT='https://purl.stanford.edu'
33
-
34
-
35
- == Advanced features
36
-
37
- The engine gives the developer the ability to extend any of its classes
38
-
39
- To extend any of indexer features (purl-reader, mods-reader, mapper, solr-writer)
40
-
41
- 1. Create a new class that inherits from BaseIndexer::MainIndexerEngine
42
- 2. Create a new file named config/initializers/base_indexer.rb
43
- 3. In this file, add the following line. Replace 'MyIndexerClassName' with the fully qualified actual class name. The name should be between double quotes
44
- BaseIndexer.indexer_class = "MyIndexerClassName"
45
- 4. In the new indexer class, you can override any of the functions that you need to change its implementation. For example, if you need to use a new mapper, you will override map function.
46
-
47
- To extend mapper functionality.
48
- 1. Create a new class e.g., MyMapper that inherits from GeneralMapper or IndexMapper.
49
- 2. Implement MyMapper.map to converts the input to solr doc hash.
50
- 3. Override MyIndexerClassName.map to call your new class instead of the default one.
51
-
52
- == Rake Tasks For Indexing Druids
53
-
54
- All rake tasks that perform batch indexing will generate log files in the "log" folder within the app itself. You can tail the log file to watch the progress. The
55
- log file is also useful since you can pass it to the "reindexer" rake task to retry just the errored out druids. The name of the log file will depend on which
56
- rake task you are running, and will be timestamped to be unique.
57
-
58
- ==== Index a single druid:
59
-
60
- rake index RAILS_ENV=production target=revs_prod druid=oo000oo0001
61
-
62
- ==== Index a list of druids from a pre-assembly run, a remediation run, or a simple CSV:
63
-
64
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly = preassembly run
65
- nohup rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly & = for a long running process, which will be most runs that have more than a few dozen druids, nohup it
66
-
67
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediate.yaml log_type=remediate = remediation run
68
-
69
- rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander.csv log_type=csv = a simple csv file -- it must have a header line, with the header of "druid" defining the items you wish to index
70
-
71
- ==== Index an entire collection, including the collection itself, along with all of its members (be sure to check the dor-fetcher-url parameter in the Rails environment you are running under to be sure it is connecting where you expect):
72
-
73
- rake collection_indexer RAILS_ENV=production target=revs_prod collection_druid=oo000oo0001
74
- nohup rake collection_indexer RAILS_ENV=production target=revs_prod collection_druid=oo000oo0001 & = for a long running process, e.g. a collection with more than a few dozen druids, nohup it
75
-
76
- ==== Re-Index Just Errored Out Items
77
-
78
- If you had errors when indexing from a preassembly/remediation log or from indexing an entire collection, you can re-run the errored out druids only with the log file. All log files are kept in the log folder in the revs-indexer-service app.
79
-
80
- rake reindexer RAILS_ENV=production target=revs_prod file=log/logfile.log
81
-
82
- nohup rake reindexer RAILS_ENV=production target=revs_prod file=log/logfile.log & = probably no need to nohup unless there were alot of errors
83
-
84
-
85
- ==== Delete Druids
86
-
87
- Delete a list of druids specified in a CSV/txt file. Be careful, this will delete from all targets! Put one druid per line, no header is necessary.
88
-
89
- rake delete_druids RAILS_ENV=production file=druid_list.txt
90
-
91
- ==== Delete a single druid
92
-
93
- rake delete RAILS_ENV=production druid=oo000oo0001
@@ -1,46 +0,0 @@
1
- module BaseIndexer
2
-
3
- # It caches the collection information such as name and catkey
4
- class Collection
5
-
6
- def initialize(collection_druid)
7
- @collection_druid = collection_druid
8
- end
9
-
10
- # Returns the collection name from cache, otherwise will fetch it from PURL.
11
- #
12
- # @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
13
- # @return [Array<String>] the collection data or [] if there is no name and catkey or the object
14
- # is not a collection
15
- def collection_info
16
- from_cache || from_purl || {}
17
- end
18
-
19
- private
20
-
21
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
22
- # @return [String] return the collection label from cache if available, nil otherwise
23
- def from_cache
24
- Rails.cache.read(@collection_druid)
25
- end
26
-
27
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
28
- # @return [String] return the collection label from purl if available, nil otherwise
29
- def from_purl
30
- return nil unless purl_model
31
- return nil unless purl_model.is_collection
32
- purl_data = { label: purl_model.label, ckey: purl_model.catkey }
33
- Rails.cache.write(@collection_druid, purl_data, expires_in: 1.hours)
34
- purl_data
35
- end
36
-
37
- def purl_model
38
- @purl_model ||= begin
39
- DiscoveryIndexer::InputXml::Purlxml.new(@collection_druid).load
40
- rescue => e
41
- Rails.logger.error "There is a problem in retrieving collection name and/or catkey for #{@collection_druid}. #{e.inspect}\n#{e.message }\n#{e.backtrace}"
42
- nil
43
- end
44
- end
45
- end
46
- end