harvestdor-indexer 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
require 'logger'

module Harvestdor
  # The namespace is opened in nested form so this file can be loaded without
  # Harvestdor::Indexer having been defined first.
  class Indexer
    ##
    # Harvest metrics tracker: counts how many operations succeeded and how
    # many failed, and logs every failure.
    class Metrics
      attr_accessor :error_count, :success_count, :logger

      ##
      # @param [Hash] options
      # @option options [#error] :logger receives failure messages; when it is
      #   omitted a Logger writing to STDERR is created
      def initialize(options = {})
        @success_count = 0 # the number of objects successfully indexed
        @error_count = 0   # the number of objects that failed
        @logger = options[:logger] || Logger.new(STDERR)
      end

      ##
      # Wrap an operation in a tally block; if the block completes without
      # raising, tally a success. If the block raises a StandardError, rescue
      # it, tally a failure and log the exception message.
      #
      # Callers can provide an :on_error handler to receive the exception and
      # process it appropriately. The exception is not re-raised here.
      #
      # @param [Hash] options
      # @option options [#call] :on_error callback that will receive any
      #   exception raised by the block
      # @return [Object] the new success count when the block succeeds;
      #   otherwise the result of the :on_error handler, or nil without one
      def tally(options = {})
        yield
        success!
      rescue => e
        error!
        logger.error "Failed to process: #{e.message}"
        options[:on_error].call e if options[:on_error]
      end

      ##
      # Record a successful run
      # @return [Integer] the updated success count
      def success!
        @success_count += 1
      end

      ##
      # Record an error
      # @return [Integer] the updated error count
      def error!
        @error_count += 1
      end

      ##
      # Total number of runs
      # @return [Integer] successes plus errors
      def total
        @success_count + @error_count
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
+ require 'active_support/benchmarkable'
2
+
3
require 'logger'

module Harvestdor
  # The namespace is opened in nested form so this file can be loaded without
  # Harvestdor::Indexer having been defined first.
  class Indexer
    ##
    # A single DOR object, identified by its druid. Metadata is retrieved on
    # demand through the owning indexer's harvestdor and dor-fetcher clients.
    class Resource
      include ActiveSupport::Benchmarkable

      attr_reader :indexer, :druid, :options

      ##
      # @param [Object] indexer the owning indexer; it supplies
      #   harvestdor_client, dor_fetcher_client and (optionally) logger
      # @param [String] druid e.g. druid:ab123cd4567 or ab123cd4567
      # @param [Hash] options
      # @option options [Object] :logger logger used in preference to the
      #   indexer's logger
      def initialize(indexer, druid, options = {})
        @indexer = indexer
        @druid = druid
        @options = options
      end

      ##
      # The druid without its "druid:" prefix.
      # Only a leading prefix is removed; the rest of the string is untouched.
      # @return [String] e.g. ab123cd4567
      def bare_druid
        @bare_druid ||= druid.sub(/\Adruid:/, "")
      end

      ##
      # The harvestdor client used for retrieving resources
      def harvestdor_client
        indexer.harvestdor_client
      end

      ##
      # The dor-fetcher client used for listing collection members
      def dor_fetcher_client
        indexer.dor_fetcher_client
      end

      ##
      # Get the logger: the :logger option, else the indexer's logger (when the
      # indexer has one), else a STDERR logger that is created once and reused.
      def logger
        options[:logger] ||
          (indexer.logger if indexer.respond_to? :logger) ||
          (@fallback_logger ||= Logger.new(STDERR))
      end

      ##
      # Is this resource a collection?
      # @return [Boolean] true when any /identityMetadata/objectType element
      #   has the text "collection"
      def collection?
        identity_metadata.xpath("/identityMetadata/objectType").any? { |x| x.text == "collection" }
      end

      ##
      # The collections this object belongs to, taken from the
      # isMemberOfCollection relationships in the RDF of the public xml.
      # @return [Array<Harvestdor::Indexer::Resource>] one resource per
      #   non-empty isMemberOfCollection reference; empty if there are none
      def collections
        @collections ||= begin
          ns_hash = {'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => "info:fedora/fedora-system:def/relations-external#", '' => ''}
          is_member_of_nodes = public_xml.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)

          is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
            Harvestdor::Indexer::Resource.new(indexer, n.value.gsub("info:fedora/", ""))
          end
        end
      end

      ##
      # Return the items in this collection
      # @return [Array<Harvestdor::Indexer::Resource>] one resource per druid
      #   reported by the dor-fetcher client for this collection
      def items
        @items ||= begin
          druids = dor_fetcher_client.druid_array(dor_fetcher_client.get_collection(bare_druid, {}))
          druids.map { |x| Harvestdor::Indexer::Resource.new(indexer, x) }
        end
      end

      ##
      # The objectLabel from this object's identityMetadata.
      # @return [String] the text of the <objectLabel> element
      # @raise [RuntimeError] if there is no identityMetadata
      def identity_md_obj_label
        # identity_metadata raises when the metadata is missing, so no nil
        # check is needed, and it is fetched only once here.
        identity_metadata.xpath('identityMetadata/objectLabel').text
      end

      ##
      # The MODS for this object as a Stanford::Mods::Record object
      # @return [Stanford::Mods::Record] created from the MODS xml for the druid
      # @raise [RuntimeError] if the MODS document contains no text nodes
      def smods_rec
        @smods_rec ||= benchmark "smods_rec(#{druid})", level: :debug do
          ng_doc = mods
          raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
          mods_rec = Stanford::Mods::Record.new
          mods_rec.from_nk_node(ng_doc.root)
          mods_rec
        end
      end

      ##
      # The MODS xml for this object, as returned by the harvestdor client
      def mods
        @mods ||= harvestdor_client.mods bare_druid
      end

      ##
      # The public xml for this DOR object, as returned by the harvestdor client
      # @return [Nokogiri::XML::Document] the public xml for the DOR object
      # @raise [RuntimeError] if there is no public xml or it has no text nodes
      def public_xml
        @public_xml ||= benchmark "public_xml(#{druid})", level: :debug do
          ng_doc = harvestdor_client.public_xml bare_druid
          raise "No public xml for #{druid}" if !ng_doc
          raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
          ng_doc
        end
      end

      ##
      # Has the public_xml been previously retrieved?
      def public_xml?
        !!@public_xml
      end

      ##
      # Get the public_xml, if retrieved, or the druid. This is used to
      # short-circuit retrieving metadata out of the public xml.
      # @return [Nokogiri::XML::Document, String] the cached public xml, or the
      #   bare druid when the public xml has not been retrieved yet
      def public_xml_or_druid
        if public_xml?
          public_xml
        else
          bare_druid
        end
      end

      ##
      # The contentMetadata for this DOR object
      # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
      # @raise [RuntimeError] if the client returns nothing or an empty document
      def content_metadata
        fetch_public_xml_part :content_metadata, "contentMetadata"
      end

      ##
      # The identityMetadata for this DOR object
      # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
      # @raise [RuntimeError] if the client returns nothing or an empty document
      def identity_metadata
        fetch_public_xml_part :identity_metadata, "identityMetadata"
      end

      ##
      # The rightsMetadata for this DOR object
      # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
      # @raise [RuntimeError] if the client returns nothing or an empty document
      def rights_metadata
        fetch_public_xml_part :rights_metadata, "rightsMetadata"
      end

      ##
      # The RDF for this DOR object
      # @return [Nokogiri::XML::Document] the RDF for the DOR object
      # @raise [RuntimeError] if the client returns nothing or an empty document
      def rdf
        fetch_public_xml_part :rdf, "RDF"
      end

      ##
      # Two resources are eql? when they share the same indexer and druid.
      def eql?(other)
        other.is_a?(Harvestdor::Indexer::Resource) && other.indexer == indexer && other.druid == druid
      end

      ##
      # Hash code consistent with eql?: combines the druid and the indexer.
      def hash
        druid.hash ^ indexer.hash
      end

      private

      ##
      # Ask the harvestdor client for one part of this object's metadata,
      # passing the cached public xml when available and the bare druid
      # otherwise, and time the call at debug level.
      # @param [Symbol] client_method the harvestdor client method to call
      # @param [String] description the name used in the error message
      # @return [Nokogiri::XML::Document]
      # @raise [RuntimeError] if the client returns nothing or a document
      #   without children
      def fetch_public_xml_part(client_method, description)
        ng_doc = benchmark "#{client_method} (#{druid})", level: :debug do
          harvestdor_client.public_send(client_method, public_xml_or_druid)
        end
        raise "No #{description} for \"#{druid}\"" if !ng_doc || ng_doc.children.empty?
        ng_doc
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
module Harvestdor
  # The namespace is opened in nested form so this file can be loaded without
  # Harvestdor::Indexer having been defined first.
  class Indexer
    ##
    # Wrapper around an RSolr client that adds documents with retries and
    # logs through the owning indexer's logger.
    class Solr
      attr_accessor :client, :indexer, :config

      ##
      # @param [Object] indexer the owning indexer; its logger is used for
      #   all log output
      # @param [Hash] config passed to RSolr.connect and also wrapped in a
      #   Confstruct configuration; max_retries defaults to 10 when unset
      def initialize(indexer, config = {})
        @indexer = indexer
        @client = RSolr.connect(config)
        @config = Confstruct::Configuration.new config
        @config.max_retries ||= 10
      end

      ##
      # The indexer's logger
      def logger
        indexer.logger
      end

      ##
      # Send a commit to Solr
      def commit!
        client.commit
      end

      # Add the document to solr, retry if an error occurs.
      # See https://github.com/ooyala/retries for docs on with_retries.
      # @param [Hash] doc a Hash representation of the solr document
      def add(doc)
        # The id is only used in log messages; accept symbol or string keys so
        # string-keyed documents do not log a blank id.
        id = doc[:id] || doc['id']

        # Called by with_retries after each failed attempt.
        handler = proc do |exception, attempt_number, _total_delay|
          logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
        end

        retry_options = {
          :max_tries => config.max_retries,
          :handler => handler,
          :base_sleep_seconds => 1,
          :max_sleep_seconds => 5
        }

        with_retries(retry_options) do |attempt|
          logger.debug "Attempt #{attempt} for #{id}"
          client.add(doc)
          logger.info "Successfully indexed #{id} on attempt #{attempt}"
        end
      end
    end
  end
end
@@ -1,6 +1,6 @@
1
1
module Harvestdor
  class Indexer
    # this is the Ruby Gem version; frozen so the shared string cannot be
    # mutated by accident
    VERSION = "2.0.0".freeze
  end
end
@@ -1,28 +1,6 @@
1
- # You will want to copy this file and change the following settings:
2
- # 1. log_dir, log_name
3
- # 2. default_set
4
- # 3. blacklist or whitelist if you are using them
5
- # 4. Solr baseurl
6
-
7
- # log_dir: directory for log file (default logs, relative to harvestdor gem path)
8
- log_dir: spec/test_logs
9
-
10
- # log_name: name of log file (default: harvestdor.log)
11
- log_name: ap-test.log
12
-
13
- # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
14
- purl: http://purl.stanford.edu
15
-
16
- # ---------- White and Black list parameters -----
17
-
18
- # name of file containing druids that will NOT be processed even if they are harvested
19
- # via DorFetcher either give absolute path or path relative to where the command will
20
- # be executed
21
- #blacklist: config/ap_blacklist.txt
22
-
23
- # name of file containing druids that WILL be processed (all others will be ignored)
24
- # either give absolute path or path relative to where the command will be executed
25
- #whitelist: config/ap_whitelist.txt
1
+ dor_fetcher:
2
+ service_url: http://127.0.0.1:3000
3
+ skip_heartbeat: true
26
4
 
27
5
  # ----------- SOLR index (that we're writing INTO) parameters ------------
28
6
  solr:
@@ -31,25 +9,35 @@ solr:
31
9
  # timeouts are in seconds; read_timeout -> open/read, open_timeout -> connection open
32
10
  read_timeout: 60
33
11
  open_timeout: 60
12
+ max_retries: 10
34
13
 
35
- # default_set: default set for harvest (default: nil)
36
- # can be overridden on calls to harvest_ids and harvest_records
37
- default_set: is_governed_by_yg867hg1375
38
-
39
- # default_metadata_prefix: default metadata prefix to be used for harvesting (default: mods)
40
- # can be overridden on calls to harvest_ids and harvest_records
41
-
42
- # default_from_date: default from date for harvest (default: nil)
43
- # can be overridden on calls to harvest_ids and harvest_records
14
+ # ---------- Whitelist parameters -----
44
15
 
45
- # default_until_date: default until date for harvest (default: nil)
46
- # can be overridden on calls to harvest_ids and harvest_records
16
+ # name of file containing druids that WILL be processed (all others will be ignored)
17
+ # either give absolute path or path relative to where the command will be executed
18
+ #whitelist: config/ap_whitelist.txt
47
19
 
48
- # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
49
- http_options:
50
- ssl:
51
- verify: false
52
- # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
53
- request:
54
- timeout: 180
55
- open_timeout: 180
20
+ harvestdor:
21
+ # You will want to copy this file and change the following settings:
22
+ # 1. log_dir, log_name
23
+ # 2. default_set
24
+ # 3. whitelist, if you are using one
25
+ # 4. Solr baseurl
26
+
27
+ # log_dir: directory for log file (default logs, relative to harvestdor gem path)
28
+ log_dir: spec/test_logs
29
+
30
+ # log_name: name of log file (default: harvestdor.log)
31
+ log_name: ap-test.log
32
+
33
+ # purl: url for the DOR purl server (used to get ContentMetadata, etc.)
34
+ purl: http://purl.stanford.edu
35
+
36
+ # Additional options to pass to Faraday http client (https://github.com/technoweenie/faraday)
37
+ http_options:
38
+ ssl:
39
+ verify: false
40
+ # timeouts are in seconds; timeout -> open/read, open_timeout -> connection open
41
+ request:
42
+ timeout: 180
43
+ open_timeout: 180
@@ -53,4 +53,100 @@ http_interactions:
53
53
  social customs, and people."}],"counts":{"collections":1,"items":5,"total_count":6}}'
54
54
  http_version:
55
55
  recorded_at: Wed, 12 Nov 2014 19:34:03 GMT
56
+ - request:
57
+ method: get
58
+ uri: http://purl.stanford.edu/yg867hg1375.xml
59
+ body:
60
+ encoding: US-ASCII
61
+ string: ''
62
+ headers:
63
+ Accept-Encoding:
64
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
65
+ Accept:
66
+ - "*/*"
67
+ User-Agent:
68
+ - Ruby
69
+ response:
70
+ status:
71
+ code: 200
72
+ message: ''
73
+ headers:
74
+ Date:
75
+ - Wed, 17 Dec 2014 19:39:37 GMT
76
+ Server:
77
+ - Apache/2.2.15 (Red Hat)
78
+ X-Powered-By:
79
+ - Phusion Passenger (mod_rails/mod_rack) 3.0.19
80
+ X-Ua-Compatible:
81
+ - IE=Edge,chrome=1
82
+ Etag:
83
+ - '"67aa6d1481ba1537ae63af5aaf493f84"'
84
+ Cache-Control:
85
+ - max-age=0, private, must-revalidate
86
+ X-Request-Id:
87
+ - f2e753d56bf896cde6e941be0f51d05a
88
+ X-Runtime:
89
+ - '0.007983'
90
+ X-Rack-Cache:
91
+ - miss
92
+ Status:
93
+ - '200'
94
+ Content-Length:
95
+ - '2180'
96
+ Content-Type:
97
+ - application/xml; charset=utf-8
98
+ body:
99
+ encoding: UTF-8
100
+ string: |
101
+ <publicObject id="druid:yg867hg1375" published="2013-11-11T15:34:32-08:00">
102
+ <identityMetadata>
103
+ <objectId>druid:yg867hg1375</objectId>
104
+ <objectCreator>DOR</objectCreator>
105
+ <objectLabel>Francis E. Stafford photographs, 1909-1933</objectLabel>
106
+ <objectType>collection</objectType>
107
+ <adminPolicy>druid:vb546ms7107</adminPolicy>
108
+ <otherId name="catkey">9615156</otherId>
109
+ <otherId name="uuid">8f1feb20-4b29-11e3-8e31-0050569b3c3c</otherId>
110
+ <tag>Remediated By : 3.25.3</tag>
111
+ </identityMetadata>
112
+ <xml/>
113
+ <rightsMetadata>
114
+ <access type="discover">
115
+ <machine>
116
+ <world/>
117
+ </machine>
118
+ </access>
119
+ <access type="read">
120
+ <machine>
121
+ <world/>
122
+ </machine>
123
+ </access>
124
+ <use>
125
+ <human type="useAndReproduction"/>
126
+ <human type="creativeCommons"/>
127
+ <machine type="creativeCommons"/>
128
+ </use>
129
+ <copyright>
130
+ <human/>
131
+ </copyright>
132
+ </rightsMetadata>
133
+ <rdf:RDF xmlns:fedora-model="info:fedora/fedora-system:def/model#" xmlns:hydra="http://projecthydra.org/ns/relations#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
134
+ <rdf:Description rdf:about="info:fedora/druid:yg867hg1375">
135
+ </rdf:Description>
136
+ </rdf:RDF>
137
+ <oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:srw_dc="info:srw/schema/1/dc-schema" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
138
+ <dc:title>Francis E. Stafford photographs, 1909-1933</dc:title>
139
+ <dc:contributor>Stafford, Francis E., 1884-1938</dc:contributor>
140
+ <dc:type>Collection</dc:type>
141
+ <dc:date>1909-1933</dc:date>
142
+ <dc:language>und</dc:language>
143
+ <dc:format>3 oversize boxes.</dc:format>
144
+ <dc:description>Photographs of scenes in China, mainly between 1909 and 1915.</dc:description>
145
+ <dc:rights>Closed. Digital use copies available.</dc:rights>
146
+ <dc:description type="biographical/historical">American missionary in China, 1909-1915 and 1932-1933.</dc:description>
147
+ <dc:coverage>China</dc:coverage>
148
+ </oai_dc:dc>
149
+ </publicObject>
150
+ http_version:
151
+ recorded_at: Wed, 17 Dec 2014 19:39:38 GMT
56
152
  recorded_with: VCR 2.9.3