discovery-indexer 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41c0aca7bebdf8eea7c07d8c2d48944e8bcc88c4
4
- data.tar.gz: 5d0b42289e313ea24e32b4f0f66dc762c8475efa
3
+ metadata.gz: 94eb6c9bdbd29fc02f9aece9351e6c4af77a59b1
4
+ data.tar.gz: bb54745bb7c03fb7a60559e55cc7804db706cd8b
5
5
  SHA512:
6
- metadata.gz: 6d030fb91ba3fec33475e4c5fdd59aaed19e94b6178b6de53b4513be85f94655e420bb8c467bfcb8738f1e943e39b66856e10c7069a9513263d69b8eecaba2db
7
- data.tar.gz: 7c489649c3b0b34332108f05e80b75646863b8251a5a431ed8557dad955b5efe58ca0e6edab9326800b7c2a7021b04ccbc4e5e6c9a9283ace69f691c44d3f31e
6
+ metadata.gz: 80a631460ec997ab2c92b90836bca19ff8a4fc12ab70f7bbd70684cda9a152f29aa8a1d6c6431bb914a2ffc8a8ea7af7cad9196c00f6dda902dcdaaace1a8202
7
+ data.tar.gz: e02780cf225013328439cbe55b4c890ceb6fba82e77b9f3dfe98a617aaa591c310bc7f97365b1f0dfe4a11c16400e7484dd0b03222a7e59edaa56d5442521dfb
@@ -0,0 +1,48 @@
1
+ module DiscoveryIndexer
2
+
3
+ # It caches the collection information such as name and catkey
4
+ class Collection
5
+
6
+ attr_reader :druid
7
+ delegate :present?, to: :collection_info
8
+
9
+ def initialize(druid)
10
+ @druid = druid
11
+ end
12
+
13
+ def searchworks_id
14
+ collection_info[:ckey] || druid
15
+ end
16
+
17
+ def title
18
+ collection_info[:title]
19
+ end
20
+
21
+ private
22
+
23
+ # Returns the collection name from cache, otherwise will fetch it from PURL.
24
+ #
25
+ # @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
26
+ # @return [Array<String>] the collection data or [] if there is no name and catkey or the object
27
+ # is not a collection
28
+ def collection_info
29
+ from_purl || {}
30
+ end
31
+
32
+ # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
33
+ # @return [String] return the collection label from purl if available, nil otherwise
34
+ def from_purl
35
+ return unless purl_model
36
+ { title: purl_model.label, ckey: purl_model.catkey }
37
+ end
38
+
39
+ def purl_model
40
+ @purl_model ||= begin
41
+ DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
+ rescue => e
43
+ DiscoveryIndexer::Logging.logger.error "There is a problem in retrieving collection name and/or catkey for #{druid}. #{e.inspect}\n#{e.message }\n#{e.backtrace}"
44
+ nil
45
+ end
46
+ end
47
+ end
48
+ end
File without changes
@@ -0,0 +1,44 @@
1
+ module DiscoveryIndexer
2
+ class GeneralMapper
3
+
4
+ attr_reader :druid
5
+
6
+ # Initializes an instance from IndexMapper
7
+ # @param [String] druid e.g. ab123cd4567
8
+ # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
9
+ # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
10
+ # @param [Hash] collection_data represents a hash of collection_druid and catkey
11
+ # collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
12
+ def initialize(druid)
13
+ @druid = druid
14
+ end
15
+
16
+ # Create a Hash representing a Solr doc, with all MODS related fields populated.
17
+ # @return [Hash] Hash representing the Solr document
18
+ def convert_to_solr_doc
19
+ solr_doc = {}
20
+ solr_doc[:id] = druid
21
+ solr_doc[:title] = modsxml.sw_full_title
22
+ solr_doc
23
+ end
24
+
25
+ # It converts collection_druids list to a hash with names. If the druid doesn't
26
+ # have a collection name, it will be excluded from the hash
27
+ # @return [Hash] a hash for collection druid and its name
28
+ # !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
29
+ def collection_data
30
+ @collection_data ||= collection_druids.map do |cdruid|
31
+ DiscoveryIndexer::Collection.new(cdruid)
32
+ end
33
+ end
34
+ def collection_druids
35
+ purlxml.collection_druids
36
+ end
37
+ def modsxml
38
+ @modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
39
+ end
40
+ def purlxml
41
+ @purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
+ end
43
+ end
44
+ end
File without changes
@@ -25,15 +25,6 @@ module DiscoveryIndexer
25
25
 
26
26
  modsxml_model = Stanford::Mods::Record.new
27
27
  modsxml_model.from_nk_node(@modsxml_ng_doc)
28
- modsxml_model
29
- end
30
-
31
- # loads the mods xml to stanford mods model for the fedora object defind in the druid,
32
- # it reads the mods xml from PURL server with every call
33
- # @return [Stanford::Mods::Record] represents the mods xml
34
- def reload
35
- @modsxml_ng_doc = ModsxmlReader.read(@druid)
36
- load
37
28
  end
38
29
  end
39
30
  end
@@ -19,17 +19,7 @@ module DiscoveryIndexer
19
19
  # @return [PurlxmlModel] represents the purlxml
20
20
  def load
21
21
  @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
- purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
23
- purlxml_model = purlxml_parser.parse
24
- purlxml_model
25
- end
26
-
27
- # loads the purl xml to purlxml model for the fedora object defind in the druid
28
- # it reads the purl xml from PURL server with every call
29
- # @return [PurlxmlModel] represents the purlxml
30
- def reload
31
- @purlxml_ng_doc = PurlxmlReader.read(@druid)
32
- load
22
+ purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
33
23
  end
34
24
  end
35
25
  end
@@ -1,12 +1,17 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
- class PurlxmlParserStrict < PurlxmlParser
3
+ class PurlxmlParserStrict
4
4
  include DiscoveryIndexer::Logging
5
5
 
6
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
7
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
8
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
9
9
 
10
+ def initialize(druid, purlxml_ng_doc)
11
+ @purlxml_ng_doc = purlxml_ng_doc
12
+ @druid = druid
13
+ end
14
+
10
15
  # it parses the purlxml into a purlxml model
11
16
  # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
12
17
  def parse
@@ -117,12 +122,9 @@ module DiscoveryIndexer
117
122
  ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
118
123
  is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
119
124
  # from public_xml rels-ext
120
- druids = []
121
- is_member_of_nodes.each do |n|
122
- druids << n.value.split('druid:').last unless n.value.empty?
125
+ is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
126
+ n.value.split('druid:').last
123
127
  end
124
- return nil if druids.empty?
125
- druids
126
128
  end
127
129
 
128
130
  # the value of the type attribute for a DOR object's contentMetadata
@@ -9,10 +9,8 @@ module DiscoveryIndexer
9
9
  # @raise [MissingPublicXml] if there's no purl xml available for this druid
10
10
  def self.read(druid)
11
11
  purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
12
-
13
12
  begin
14
- purlxml_object = Nokogiri::XML(open(purlxml_uri))
15
- return purlxml_object
13
+ Nokogiri::XML(open(purlxml_uri))
16
14
  rescue
17
15
  raise DiscoveryIndexer::Errors::MissingPurlPage.new(purlxml_uri)
18
16
  end
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.10.1'
2
+ VERSION = '0.10.2'
3
3
  end
@@ -1,19 +1,16 @@
1
- require 'errors'
2
- require 'logging'
1
+ require 'discovery-indexer/errors'
2
+ require 'discovery-indexer/logging'
3
3
 
4
- require 'reader/purlxml'
5
- require 'reader/purlxml_reader'
6
- require 'reader/purlxml_parser'
7
- require 'reader/purlxml_parser_strict'
8
- require 'reader/purlxml_model'
4
+ require 'discovery-indexer/reader/purlxml'
5
+ require 'discovery-indexer/reader/purlxml_reader'
6
+ require 'discovery-indexer/reader/purlxml_parser_strict'
7
+ require 'discovery-indexer/reader/purlxml_model'
9
8
 
10
- require 'reader/modsxml'
11
- require 'reader/modsxml_reader'
9
+ require 'discovery-indexer/reader/modsxml'
10
+ require 'discovery-indexer/reader/modsxml_reader'
12
11
 
13
- require 'mapper/general_mapper'
14
-
15
- require 'writer/solr_client'
16
- require 'writer/solr_writer'
12
+ require 'discovery-indexer/general_mapper'
13
+ require 'discovery-indexer/collection'
17
14
 
18
15
  # require 'utilities/extract_sub_targets'
19
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-11 00:00:00.000000000 Z
12
+ date: 2015-12-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -81,6 +81,20 @@ dependencies:
81
81
  - - ">="
82
82
  - !ruby/object:Gem::Version
83
83
  version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: rake
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
84
98
  - !ruby/object:Gem::Dependency
85
99
  name: rspec
86
100
  requirement: !ruby/object:Gem::Requirement
@@ -145,19 +159,17 @@ extensions: []
145
159
  extra_rdoc_files: []
146
160
  files:
147
161
  - lib/discovery-indexer.rb
148
- - lib/errors.rb
149
- - lib/logging.rb
150
- - lib/mapper/general_mapper.rb
151
- - lib/reader/modsxml.rb
152
- - lib/reader/modsxml_reader.rb
153
- - lib/reader/purlxml.rb
154
- - lib/reader/purlxml_model.rb
155
- - lib/reader/purlxml_parser.rb
156
- - lib/reader/purlxml_parser_strict.rb
157
- - lib/reader/purlxml_reader.rb
158
- - lib/version.rb
159
- - lib/writer/solr_client.rb
160
- - lib/writer/solr_writer.rb
162
+ - lib/discovery-indexer/collection.rb
163
+ - lib/discovery-indexer/errors.rb
164
+ - lib/discovery-indexer/general_mapper.rb
165
+ - lib/discovery-indexer/logging.rb
166
+ - lib/discovery-indexer/reader/modsxml.rb
167
+ - lib/discovery-indexer/reader/modsxml_reader.rb
168
+ - lib/discovery-indexer/reader/purlxml.rb
169
+ - lib/discovery-indexer/reader/purlxml_model.rb
170
+ - lib/discovery-indexer/reader/purlxml_parser_strict.rb
171
+ - lib/discovery-indexer/reader/purlxml_reader.rb
172
+ - lib/discovery-indexer/version.rb
161
173
  homepage:
162
174
  licenses:
163
175
  - Stanford University
@@ -1,27 +0,0 @@
1
- module DiscoveryIndexer
2
- module Mapper
3
- class GeneralMapper
4
- # Initializes an instance from IndexMapper
5
- # @param [String] druid e.g. ab123cd4567
6
- # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
7
- # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
8
- # @param [Hash] collection_data represents a hash of collection_druid and catkey
9
- # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
10
- def initialize(druid, modsxml, purlxml, collection_data = {})
11
- @druid = druid
12
- @modsxml = modsxml
13
- @purlxml = purlxml
14
- @collection_data = collection_data
15
- end
16
-
17
- # Create a Hash representing a Solr doc, with all MODS related fields populated.
18
- # @return [Hash] Hash representing the Solr document
19
- def convert_to_solr_doc
20
- solr_doc = {}
21
- solr_doc[:id] = @druid
22
- solr_doc[:title] = @modsxml.sw_full_title
23
- solr_doc
24
- end
25
- end
26
- end
27
- end
@@ -1,13 +0,0 @@
1
- module DiscoveryIndexer
2
- module InputXml
3
- class PurlxmlParser
4
- def initialize(druid, purlxml_ng_doc)
5
- @purlxml_ng_doc = purlxml_ng_doc
6
- @druid = druid
7
- end
8
-
9
- def parse
10
- end
11
- end
12
- end
13
- end
@@ -1,113 +0,0 @@
1
- require 'retries'
2
- require 'rsolr'
3
- require 'rest-client'
4
- module DiscoveryIndexer
5
- module Writer
6
- # Processes adds and deletes to the solr core
7
- class SolrClient
8
- include DiscoveryIndexer::Logging
9
-
10
- # Add the document to solr, retry if an error occurs.
11
- # See https://github.com/ooyala/retries for docs on with_retries.
12
- # @param id [String] the document id, usually it will be druid.
13
- # @param solr_doc [Hash] a Hash representation of the solr document
14
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
15
- # @param max_retries [Integer] the maximum number of tries before fail
16
- def self.add(id, solr_doc, solr_connector, max_retries = 10)
17
- process(id, solr_doc, solr_connector, max_retries, false)
18
- end
19
-
20
- # Add the document to solr, retry if an error occurs.
21
- # See https://github.com/ooyala/retries for docs on with_retries.
22
- # @param id [String] the document id, usually it will be druid.
23
- # @param solr_connector[RSolr::Client] is an open connection with the solr core
24
- # @param max_retries [Integer] the maximum number of tries before fail
25
- def self.delete(id, solr_connector, max_retries = 10)
26
- process(id, {}, solr_connector, max_retries, true)
27
- end
28
-
29
- # It's an internal method that receives all the requests and deal with
30
- # SOLR core. This method can call add, delete, or update
31
- #
32
- # @param id [String] the document id, usually it will be druid.
33
- # @param solr_doc [Hash] is the solr doc in hash format
34
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
35
- # @param max_retries [Integer] the maximum number of tries before fail
36
- def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
- handler = proc do |exception, attempt_number, _total_delay|
38
- DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
39
- end
40
-
41
- with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
42
- DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
43
-
44
- if is_delete
45
- DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
46
- solr_connector.delete_by_id(id, :add_attributes => {:commitWithin => 10000})
47
- elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
48
- DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
49
- update_solr_doc(id, solr_doc, solr_connector)
50
- else
51
- DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
52
- solr_connector.add(solr_doc, :add_attributes => {:commitWithin => 10000})
53
- end
54
- #solr_connector.commit
55
- DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
56
- end
57
- end
58
-
59
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
- # @return [Boolean] true if the solr core allowing update feature
61
- def self.allow_update?(solr_connector)
62
- solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
- end
64
-
65
- # @param id [String] the document id, usually it will be druid.
66
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
- # @return [Boolean] true if the solr doc defined by this id exists
68
- def self.doc_exists?(id, solr_connector)
69
- response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
- response['response']['numFound'] == 1
71
- end
72
-
73
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
74
- # send hard commit to solr
75
- def self.commit(solr_connector)
76
- RestClient.post self.solr_url(solr_connector), {},:content_type => :json, :accept=>:json
77
- end
78
-
79
- # It is an internal method that updates the solr doc instead of adding a new one.
80
- # @param id [String] the document id, usually it will be druid.
81
- # @param solr_doc [Hash] is the solr doc in hash format
82
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
83
- def self.update_solr_doc(id, solr_doc, solr_connector)
84
- # update_solr_doc can't used RSolr because updating hash doc is not supported
85
- # so we need to build the json input manually
86
- params = "[{\"id\":\"#{id}\","
87
- solr_doc.each do |field_name, new_values|
88
- next if field_name == :id
89
- params += "\"#{field_name}\":"
90
- new_values = [new_values] unless new_values.class == Array
91
- new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
92
- params += "{\"set\":[\"#{new_values.join('","')}\"]},"
93
- end
94
- params.chomp!(',')
95
- params += '}]'
96
- RestClient.post self.solr_url(solr_connector), params, content_type: :json, accept: :json
97
- end
98
-
99
- # adjust the solr_url so it works with or without a trailing /
100
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
101
- # @return [String] the solr URL
102
- def self.solr_url(solr_connector)
103
- solr_url = solr_connector.options[:url]
104
- if solr_url.end_with?('/')
105
- "#{solr_url}update?commit=true"
106
- else
107
- "#{solr_url}/update?commit=true"
108
- end
109
- end
110
-
111
- end
112
- end
113
- end
@@ -1,54 +0,0 @@
1
- require 'retries'
2
- require 'rsolr'
3
-
4
- module DiscoveryIndexer
5
- module Writer
6
- # Performs writes to solr client based upon true and false release flags
7
- class SolrWriter
8
- attr_reader :solr_targets_configs
9
-
10
- include DiscoveryIndexer::Logging
11
-
12
- def process(id, index_doc, targets, targets_configs)
13
- @solr_targets_configs = targets_configs
14
- index_targets = targets.select { |_, b| b }.keys
15
- delete_targets = targets.reject { |_, b| b }.keys
16
-
17
- # get targets with true
18
- solr_index_client(id, index_doc, index_targets) if index_targets.present?
19
- # get targets with false
20
- solr_delete_client(id, delete_targets) if delete_targets.present?
21
- end
22
-
23
- def solr_delete_from_all(id, targets_configs)
24
- # Get a list of all registered targets
25
- @solr_targets_configs = targets_configs
26
- targets = solr_targets_configs.keys
27
- solr_delete_client(id, targets)
28
- end
29
-
30
- def solr_index_client(id, index_doc, targets)
31
- targets.each do |solr_target|
32
- solr_connector = get_connector_for_target(solr_target)
33
- SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
34
- end
35
- end
36
-
37
- def solr_delete_client(id, targets)
38
- targets.each do |solr_target|
39
- solr_connector = get_connector_for_target(solr_target)
40
- SolrClient.delete(id, solr_connector) unless solr_connector.nil?
41
- end
42
- end
43
-
44
- def get_connector_for_target(solr_target)
45
- solr_connector = nil
46
- if solr_targets_configs.keys.include?(solr_target)
47
- config = solr_targets_configs[solr_target]
48
- solr_connector = RSolr.connect(config.deep_symbolize_keys)
49
- end
50
- solr_connector
51
- end
52
- end
53
- end
54
- end