discovery-indexer 0.10.1 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41c0aca7bebdf8eea7c07d8c2d48944e8bcc88c4
4
- data.tar.gz: 5d0b42289e313ea24e32b4f0f66dc762c8475efa
3
+ metadata.gz: 94eb6c9bdbd29fc02f9aece9351e6c4af77a59b1
4
+ data.tar.gz: bb54745bb7c03fb7a60559e55cc7804db706cd8b
5
5
  SHA512:
6
- metadata.gz: 6d030fb91ba3fec33475e4c5fdd59aaed19e94b6178b6de53b4513be85f94655e420bb8c467bfcb8738f1e943e39b66856e10c7069a9513263d69b8eecaba2db
7
- data.tar.gz: 7c489649c3b0b34332108f05e80b75646863b8251a5a431ed8557dad955b5efe58ca0e6edab9326800b7c2a7021b04ccbc4e5e6c9a9283ace69f691c44d3f31e
6
+ metadata.gz: 80a631460ec997ab2c92b90836bca19ff8a4fc12ab70f7bbd70684cda9a152f29aa8a1d6c6431bb914a2ffc8a8ea7af7cad9196c00f6dda902dcdaaace1a8202
7
+ data.tar.gz: e02780cf225013328439cbe55b4c890ceb6fba82e77b9f3dfe98a617aaa591c310bc7f97365b1f0dfe4a11c16400e7484dd0b03222a7e59edaa56d5442521dfb
@@ -0,0 +1,48 @@
1
+ module DiscoveryIndexer
2
+
3
+ # It caches the collection information such as name and catkey
4
+ class Collection
5
+
6
+ attr_reader :druid
7
+ delegate :present?, to: :collection_info
8
+
9
+ def initialize(druid)
10
+ @druid = druid
11
+ end
12
+
13
+ def searchworks_id
14
+ collection_info[:ckey] || druid
15
+ end
16
+
17
+ def title
18
+ collection_info[:title]
19
+ end
20
+
21
+ private
22
+
23
+ # Returns the collection title and catkey, fetched from PURL through the memoized purl_model.
24
+ #
25
+ # Takes no arguments: the collection druid (e.g., ab123cd4567) is read from the druid attribute.
26
+ # @return [Hash] the collection data (:title and :ckey), or {} if the PURL model could not
27
+ # be loaded
28
+ def collection_info
29
+ from_purl || {}
30
+ end
31
+
32
+ # Takes no arguments: the collection druid (e.g., ab123cd4567) is read from the druid attribute.
33
+ # @return [Hash, nil] the collection :title (purl label) and :ckey (purl catkey), nil if the purl model is unavailable
34
+ def from_purl
35
+ return unless purl_model
36
+ { title: purl_model.label, ckey: purl_model.catkey }
37
+ end
38
+
39
+ def purl_model
40
+ @purl_model ||= begin
41
+ DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
+ rescue => e
43
+ DiscoveryIndexer::Logging.logger.error "There is a problem in retrieving collection name and/or catkey for #{druid}. #{e.inspect}\n#{e.message }\n#{e.backtrace}"
44
+ nil
45
+ end
46
+ end
47
+ end
48
+ end
File without changes
@@ -0,0 +1,44 @@
1
+ module DiscoveryIndexer
2
+ class GeneralMapper
3
+
4
+ attr_reader :druid
5
+
6
+ # Initializes an instance from IndexMapper
7
+ # @param [String] druid e.g. ab123cd4567
8
+ # The following are not constructor arguments; they are loaded on demand from the druid:
9
+ #   modsxml [Stanford::Mods::Record] represents the MODS xml for the druid
10
+ #   purlxml [DiscoveryIndexer::Reader::PurlxmlModel] represents the purlxml model
11
+ #   collection_data [Array<DiscoveryIndexer::Collection>] one Collection per collection druid of this object
12
+ def initialize(druid)
13
+ @druid = druid
14
+ end
15
+
16
+ # Create a Hash representing a Solr doc, with all MODS related fields populated.
17
+ # @return [Hash] Hash representing the Solr document
18
+ def convert_to_solr_doc
19
+ solr_doc = {}
20
+ solr_doc[:id] = druid
21
+ solr_doc[:title] = modsxml.sw_full_title
22
+ solr_doc
23
+ end
24
+
25
+ # It wraps each druid in the collection_druids list in a DiscoveryIndexer::Collection,
26
+ # which exposes the collection title and searchworks id. No druid is excluded.
27
+ # @return [Array<DiscoveryIndexer::Collection>] one Collection per collection druid
28
+ # e.g. Collection objects for "ab123cd4567" and "ef123gh4567"
29
+ def collection_data
30
+ @collection_data ||= collection_druids.map do |cdruid|
31
+ DiscoveryIndexer::Collection.new(cdruid)
32
+ end
33
+ end
34
+ def collection_druids
35
+ purlxml.collection_druids
36
+ end
37
+ def modsxml
38
+ @modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
39
+ end
40
+ def purlxml
41
+ @purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
+ end
43
+ end
44
+ end
File without changes
@@ -25,15 +25,6 @@ module DiscoveryIndexer
25
25
 
26
26
  modsxml_model = Stanford::Mods::Record.new
27
27
  modsxml_model.from_nk_node(@modsxml_ng_doc)
28
- modsxml_model
29
- end
30
-
31
- # loads the mods xml to stanford mods model for the fedora object defined in the druid,
32
- # it reads the mods xml from PURL server with every call
33
- # @return [Stanford::Mods::Record] represents the mods xml
34
- def reload
35
- @modsxml_ng_doc = ModsxmlReader.read(@druid)
36
- load
37
28
  end
38
29
  end
39
30
  end
@@ -19,17 +19,7 @@ module DiscoveryIndexer
19
19
  # @return [PurlxmlModel] represents the purlxml
20
20
  def load
21
21
  @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
- purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc)
23
- purlxml_model = purlxml_parser.parse
24
- purlxml_model
25
- end
26
-
27
- # loads the purl xml to purlxml model for the fedora object defined in the druid
28
- # it reads the purl xml from PURL server with every call
29
- # @return [PurlxmlModel] represents the purlxml
30
- def reload
31
- @purlxml_ng_doc = PurlxmlReader.read(@druid)
32
- load
22
+ purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
33
23
  end
34
24
  end
35
25
  end
@@ -1,12 +1,17 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
- class PurlxmlParserStrict < PurlxmlParser
3
+ class PurlxmlParserStrict
4
4
  include DiscoveryIndexer::Logging
5
5
 
6
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
7
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
8
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
9
9
 
10
+ def initialize(druid, purlxml_ng_doc)
11
+ @purlxml_ng_doc = purlxml_ng_doc
12
+ @druid = druid
13
+ end
14
+
10
15
  # it parses the purlxml into a purlxml model
11
16
  # @return [PurlxmlModel] represents the purlxml as parsed based on the parser rules
12
17
  def parse
@@ -117,12 +122,9 @@ module DiscoveryIndexer
117
122
  ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
118
123
  is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
119
124
  # from public_xml rels-ext
120
- druids = []
121
- is_member_of_nodes.each do |n|
122
- druids << n.value.split('druid:').last unless n.value.empty?
125
+ is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
126
+ n.value.split('druid:').last
123
127
  end
124
- return nil if druids.empty?
125
- druids
126
128
  end
127
129
 
128
130
  # the value of the type attribute for a DOR object's contentMetadata
@@ -9,10 +9,8 @@ module DiscoveryIndexer
9
9
  # @raise [MissingPublicXml] if there's no purl xml available for this druid
10
10
  def self.read(druid)
11
11
  purlxml_uri = "#{DiscoveryIndexer::PURL_DEFAULT}/#{druid}.xml"
12
-
13
12
  begin
14
- purlxml_object = Nokogiri::XML(open(purlxml_uri))
15
- return purlxml_object
13
+ Nokogiri::XML(open(purlxml_uri))
16
14
  rescue
17
15
  raise DiscoveryIndexer::Errors::MissingPurlPage.new(purlxml_uri)
18
16
  end
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.10.1'
2
+ VERSION = '0.10.2'
3
3
  end
@@ -1,19 +1,16 @@
1
- require 'errors'
2
- require 'logging'
1
+ require 'discovery-indexer/errors'
2
+ require 'discovery-indexer/logging'
3
3
 
4
- require 'reader/purlxml'
5
- require 'reader/purlxml_reader'
6
- require 'reader/purlxml_parser'
7
- require 'reader/purlxml_parser_strict'
8
- require 'reader/purlxml_model'
4
+ require 'discovery-indexer/reader/purlxml'
5
+ require 'discovery-indexer/reader/purlxml_reader'
6
+ require 'discovery-indexer/reader/purlxml_parser_strict'
7
+ require 'discovery-indexer/reader/purlxml_model'
9
8
 
10
- require 'reader/modsxml'
11
- require 'reader/modsxml_reader'
9
+ require 'discovery-indexer/reader/modsxml'
10
+ require 'discovery-indexer/reader/modsxml_reader'
12
11
 
13
- require 'mapper/general_mapper'
14
-
15
- require 'writer/solr_client'
16
- require 'writer/solr_writer'
12
+ require 'discovery-indexer/general_mapper'
13
+ require 'discovery-indexer/collection'
17
14
 
18
15
  # require 'utilities/extract_sub_targets'
19
16
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.10.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-11 00:00:00.000000000 Z
12
+ date: 2015-12-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -81,6 +81,20 @@ dependencies:
81
81
  - - ">="
82
82
  - !ruby/object:Gem::Version
83
83
  version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: rake
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
84
98
  - !ruby/object:Gem::Dependency
85
99
  name: rspec
86
100
  requirement: !ruby/object:Gem::Requirement
@@ -145,19 +159,17 @@ extensions: []
145
159
  extra_rdoc_files: []
146
160
  files:
147
161
  - lib/discovery-indexer.rb
148
- - lib/errors.rb
149
- - lib/logging.rb
150
- - lib/mapper/general_mapper.rb
151
- - lib/reader/modsxml.rb
152
- - lib/reader/modsxml_reader.rb
153
- - lib/reader/purlxml.rb
154
- - lib/reader/purlxml_model.rb
155
- - lib/reader/purlxml_parser.rb
156
- - lib/reader/purlxml_parser_strict.rb
157
- - lib/reader/purlxml_reader.rb
158
- - lib/version.rb
159
- - lib/writer/solr_client.rb
160
- - lib/writer/solr_writer.rb
162
+ - lib/discovery-indexer/collection.rb
163
+ - lib/discovery-indexer/errors.rb
164
+ - lib/discovery-indexer/general_mapper.rb
165
+ - lib/discovery-indexer/logging.rb
166
+ - lib/discovery-indexer/reader/modsxml.rb
167
+ - lib/discovery-indexer/reader/modsxml_reader.rb
168
+ - lib/discovery-indexer/reader/purlxml.rb
169
+ - lib/discovery-indexer/reader/purlxml_model.rb
170
+ - lib/discovery-indexer/reader/purlxml_parser_strict.rb
171
+ - lib/discovery-indexer/reader/purlxml_reader.rb
172
+ - lib/discovery-indexer/version.rb
161
173
  homepage:
162
174
  licenses:
163
175
  - Stanford University
@@ -1,27 +0,0 @@
1
- module DiscoveryIndexer
2
- module Mapper
3
- class GeneralMapper
4
- # Initializes an instance from IndexMapper
5
- # @param [String] druid e.g. ab123cd4567
6
- # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
7
- # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
8
- # @param [Hash] collection_data represents a hash of collection_druid and catkey
9
- # e.g. @collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
10
- def initialize(druid, modsxml, purlxml, collection_data = {})
11
- @druid = druid
12
- @modsxml = modsxml
13
- @purlxml = purlxml
14
- @collection_data = collection_data
15
- end
16
-
17
- # Create a Hash representing a Solr doc, with all MODS related fields populated.
18
- # @return [Hash] Hash representing the Solr document
19
- def convert_to_solr_doc
20
- solr_doc = {}
21
- solr_doc[:id] = @druid
22
- solr_doc[:title] = @modsxml.sw_full_title
23
- solr_doc
24
- end
25
- end
26
- end
27
- end
@@ -1,13 +0,0 @@
1
- module DiscoveryIndexer
2
- module InputXml
3
- class PurlxmlParser
4
- def initialize(druid, purlxml_ng_doc)
5
- @purlxml_ng_doc = purlxml_ng_doc
6
- @druid = druid
7
- end
8
-
9
- def parse
10
- end
11
- end
12
- end
13
- end
@@ -1,113 +0,0 @@
1
- require 'retries'
2
- require 'rsolr'
3
- require 'rest-client'
4
- module DiscoveryIndexer
5
- module Writer
6
- # Processes adds and deletes to the solr core
7
- class SolrClient
8
- include DiscoveryIndexer::Logging
9
-
10
- # Add the document to solr, retry if an error occurs.
11
- # See https://github.com/ooyala/retries for docs on with_retries.
12
- # @param id [String] the document id, usually it will be druid.
13
- # @param solr_doc [Hash] a Hash representation of the solr document
14
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
15
- # @param max_retries [Integer] the maximum number of tries before fail
16
- def self.add(id, solr_doc, solr_connector, max_retries = 10)
17
- process(id, solr_doc, solr_connector, max_retries, false)
18
- end
19
-
20
- # Delete the document from solr, retry if an error occurs.
21
- # See https://github.com/ooyala/retries for docs on with_retries.
22
- # @param id [String] the document id, usually it will be druid.
23
- # @param solr_connector[RSolr::Client] is an open connection with the solr core
24
- # @param max_retries [Integer] the maximum number of tries before fail
25
- def self.delete(id, solr_connector, max_retries = 10)
26
- process(id, {}, solr_connector, max_retries, true)
27
- end
28
-
29
- # It's an internal method that receives all the requests and deal with
30
- # SOLR core. This method can call add, delete, or update
31
- #
32
- # @param id [String] the document id, usually it will be druid.
33
- # @param solr_doc [Hash] is the solr doc in hash format
34
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
35
- # @param max_retries [Integer] the maximum number of tries before fail
36
- def self.process(id, solr_doc, solr_connector, max_retries, is_delete = false)
37
- handler = proc do |exception, attempt_number, _total_delay|
38
- DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
39
- end
40
-
41
- with_retries(max_tries: max_retries, handler: handler, base_sleep_seconds: 1, max_sleep_seconds: 5) do |attempt|
42
- DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
43
-
44
- if is_delete
45
- DiscoveryIndexer::Logging.logger.info "Deleting #{id} on attempt #{attempt}"
46
- solr_connector.delete_by_id(id, :add_attributes => {:commitWithin => 10000})
47
- elsif allow_update?(solr_connector) && doc_exists?(id, solr_connector)
48
- DiscoveryIndexer::Logging.logger.info "Updating #{id} on attempt #{attempt}"
49
- update_solr_doc(id, solr_doc, solr_connector)
50
- else
51
- DiscoveryIndexer::Logging.logger.info "Indexing #{id} on attempt #{attempt}"
52
- solr_connector.add(solr_doc, :add_attributes => {:commitWithin => 10000})
53
- end
54
- #solr_connector.commit
55
- DiscoveryIndexer::Logging.logger.info "Completing #{id} successfully on attempt #{attempt}"
56
- end
57
- end
58
-
59
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
60
- # @return [Boolean] true if the solr core allows the update feature
61
- def self.allow_update?(solr_connector)
62
- solr_connector.options.include?(:allow_update) ? solr_connector.options[:allow_update] : false
63
- end
64
-
65
- # @param id [String] the document id, usually it will be druid.
66
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
67
- # @return [Boolean] true if the solr doc defined by this id exists
68
- def self.doc_exists?(id, solr_connector)
69
- response = solr_connector.get 'select', params: { q: 'id:"' + id + '"' }
70
- response['response']['numFound'] == 1
71
- end
72
-
73
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
74
- # send hard commit to solr
75
- def self.commit(solr_connector)
76
- RestClient.post self.solr_url(solr_connector), {},:content_type => :json, :accept=>:json
77
- end
78
-
79
- # It is an internal method that updates the solr doc instead of adding a new one.
80
- # @param id [String] the document id, usually it will be druid.
81
- # @param solr_doc [Hash] is the solr doc in hash format
82
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
83
- def self.update_solr_doc(id, solr_doc, solr_connector)
84
- # update_solr_doc can't use RSolr because updating hash doc is not supported
85
- # so we need to build the json input manually
86
- params = "[{\"id\":\"#{id}\","
87
- solr_doc.each do |field_name, new_values|
88
- next if field_name == :id
89
- params += "\"#{field_name}\":"
90
- new_values = [new_values] unless new_values.class == Array
91
- new_values = new_values.map { |s| s.to_s.gsub('\\', '\\\\\\').gsub('"', '\"').strip } # strip leading/trailing spaces and escape quotes for each value
92
- params += "{\"set\":[\"#{new_values.join('","')}\"]},"
93
- end
94
- params.chomp!(',')
95
- params += '}]'
96
- RestClient.post self.solr_url(solr_connector), params, content_type: :json, accept: :json
97
- end
98
-
99
- # adjust the solr_url so it works with or without a trailing /
100
- # @param solr_connector [RSolr::Client] is an open connection with the solr core
101
- # @return [String] the solr URL
102
- def self.solr_url(solr_connector)
103
- solr_url = solr_connector.options[:url]
104
- if solr_url.end_with?('/')
105
- "#{solr_url}update?commit=true"
106
- else
107
- "#{solr_url}/update?commit=true"
108
- end
109
- end
110
-
111
- end
112
- end
113
- end
@@ -1,54 +0,0 @@
1
- require 'retries'
2
- require 'rsolr'
3
-
4
- module DiscoveryIndexer
5
- module Writer
6
- # Performs writes to solr client based upon true and false release flags
7
- class SolrWriter
8
- attr_reader :solr_targets_configs
9
-
10
- include DiscoveryIndexer::Logging
11
-
12
- def process(id, index_doc, targets, targets_configs)
13
- @solr_targets_configs = targets_configs
14
- index_targets = targets.select { |_, b| b }.keys
15
- delete_targets = targets.reject { |_, b| b }.keys
16
-
17
- # get targets with true
18
- solr_index_client(id, index_doc, index_targets) if index_targets.present?
19
- # get targets with false
20
- solr_delete_client(id, delete_targets) if delete_targets.present?
21
- end
22
-
23
- def solr_delete_from_all(id, targets_configs)
24
- # Get a list of all registered targets
25
- @solr_targets_configs = targets_configs
26
- targets = solr_targets_configs.keys
27
- solr_delete_client(id, targets)
28
- end
29
-
30
- def solr_index_client(id, index_doc, targets)
31
- targets.each do |solr_target|
32
- solr_connector = get_connector_for_target(solr_target)
33
- SolrClient.add(id, index_doc, solr_connector) unless solr_connector.nil?
34
- end
35
- end
36
-
37
- def solr_delete_client(id, targets)
38
- targets.each do |solr_target|
39
- solr_connector = get_connector_for_target(solr_target)
40
- SolrClient.delete(id, solr_connector) unless solr_connector.nil?
41
- end
42
- end
43
-
44
- def get_connector_for_target(solr_target)
45
- solr_connector = nil
46
- if solr_targets_configs.keys.include?(solr_target)
47
- config = solr_targets_configs[solr_target]
48
- solr_connector = RSolr.connect(config.deep_symbolize_keys)
49
- end
50
- solr_connector
51
- end
52
- end
53
- end
54
- end